From 9c45824e78754e06f392053aac2460b1988787a1 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 18 Feb 2025 20:13:09 +0800 Subject: [PATCH 1/5] Optimize file handling for DOCX and PPTX processing. - Removed redundant file content reading. - Directly passed file to BytesIO. - Simplified DOCX content extraction. - Streamlined PPTX slide processing. - Reduced memory usage in file handling. --- lightrag/api/lightrag_server.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index fba81086..f9420153 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -1343,8 +1343,7 @@ def create_app(args): from docx import Document from io import BytesIO - docx_content = await file.read() - docx_file = BytesIO(docx_content) + docx_file = BytesIO(file) doc = Document(docx_file) content = "\n".join( [paragraph.text for paragraph in doc.paragraphs] @@ -1355,8 +1354,7 @@ def create_app(args): from pptx import Presentation # type: ignore from io import BytesIO - pptx_content = await file.read() - pptx_file = BytesIO(pptx_content) + pptx_file = BytesIO(file) prs = Presentation(pptx_file) for slide in prs.slides: for shape in slide.shapes: From cac93424d90fd38019c9dfb9957fa4b170ff8355 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 18 Feb 2025 20:17:42 +0800 Subject: [PATCH 2/5] Added support for reading .xlsx files in LightRAG. - Install openpyxl if not present - Load .xlsx file using openpyxl - Extract sheet titles and content - Format rows with tab-separated values - Append sheet content to overall text --- lightrag/api/lightrag_server.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index f9420153..36403241 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -1360,6 +1360,19 @@ def create_app(args): for shape in slide.shapes: if hasattr(shape, "text"): content += shape.text + "\n" + case ".xlsx": + if not pm.is_installed("openpyxl"): + pm.install("openpyxl") + from openpyxl import load_workbook + from io import BytesIO + + xlsx_file = BytesIO(file) + wb = load_workbook(xlsx_file) + for sheet in wb: + content += f"Sheet: {sheet.title}\n" + for row in sheet.iter_rows(values_only=True): + content += "\t".join(str(cell) if cell is not None else "" for cell in row) + "\n" + content += "\n" case _: logging.error( f"Unsupported file type: {file_path.name} (extension {ext})" From d34dbc5717dc20fda2d77e2b348cbd06767fe3bf Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 18 Feb 2025 23:52:53 +0800 Subject: [PATCH 3/5] Improved document enqueue logic with existence checks. - Added return status to `apipeline_enqueue_documents` - Enhanced logging for duplicate documents --- lightrag/api/lightrag_server.py | 14 +++++++------- lightrag/lightrag.py | 5 +++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 36403241..699a961f 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -1381,15 +1381,15 @@ def create_app(args): # Insert into the RAG queue if content: - await rag.apipeline_enqueue_documents(content) - logging.info( - f"Successfully processed and enqueued file: {file_path.name}" - ) + has_new_docs = await rag.apipeline_enqueue_documents(content) + if has_new_docs: + logging.info(f"Successfully processed and enqueued file: {file_path.name}") + else: + logging.info(f"File content already exists, skipping: {file_path.name}") return True else: - logging.error( - f"No content could be extracted from file: {file_path.name}" - ) + logging.error(f"No content could be extracted from file: {file_path.name}") + return False except Exception as e: logging.error( diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 09a8df3f..ea45929e 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -653,7 +653,7 @@ class LightRAG: if update_storage: await self._insert_done() - async def apipeline_enqueue_documents(self, input: str | list[str]): + async def apipeline_enqueue_documents(self, input: str | list[str]) -> bool: """ Pipeline for Processing Documents @@ -691,11 +691,12 @@ class LightRAG: if not new_docs: logger.info("No new unique documents were found.") - return + return False # 4. Store status document await self.doc_status.upsert(new_docs) logger.info(f"Stored {len(new_docs)} new unique documents") + return True async def apipeline_process_enqueue_documents( self, From 8196df83f85a38a6790207af5236cf6f63b7eaba Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Feb 2025 01:39:25 +0800 Subject: [PATCH 4/5] Fix linting --- lightrag/api/lightrag_server.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 699a961f..5f24ceba 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -1363,7 +1363,7 @@ def create_app(args): case ".xlsx": if not pm.is_installed("openpyxl"): pm.install("openpyxl") - from openpyxl import load_workbook + from openpyxl import load_workbook # type: ignore from io import BytesIO xlsx_file = BytesIO(file) @@ -1371,7 +1371,13 @@ def create_app(args): for sheet in wb: content += f"Sheet: {sheet.title}\n" for row in sheet.iter_rows(values_only=True): - content += "\t".join(str(cell) if cell is not None else "" for cell in row) + "\n" + content += ( + "\t".join( + str(cell) if cell is not None else "" + for cell in row + ) + + "\n" + ) content += "\n" case _: logging.error( @@ -1383,12 +1389,18 @@ def create_app(args): if content: has_new_docs = await rag.apipeline_enqueue_documents(content) if has_new_docs: - logging.info(f"Successfully processed and enqueued file: {file_path.name}") + logging.info( + f"Successfully processed and enqueued file: {file_path.name}" + ) else: - logging.info(f"File content already exists, skipping: {file_path.name}") + logging.info( + f"File content already exists, skipping: {file_path.name}" + ) return True else: - logging.error(f"No content could be extracted from file: {file_path.name}") + logging.error( + f"No content could be extracted from file: {file_path.name}" + ) return False except Exception as e: From 32bfcbb33285cbe8cb00dfe76aea5c01b837a13e Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Feb 2025 13:39:50 +0800 Subject: [PATCH 5/5] Revert "Improved document enqueue logic with existence checks" --- lightrag/api/lightrag_server.py | 14 ++++---------- lightrag/lightrag.py | 5 ++--- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 5f24ceba..81437505 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -1387,21 +1387,15 @@ def create_app(args): # Insert into the RAG queue if content: - has_new_docs = await rag.apipeline_enqueue_documents(content) - if has_new_docs: - logging.info( - f"Successfully processed and enqueued file: {file_path.name}" - ) - else: - logging.info( - f"File content already exists, skipping: {file_path.name}" - ) + await rag.apipeline_enqueue_documents(content) + logging.info( + f"Successfully fetched and enqueued file: {file_path.name}" + ) return True else: logging.error( f"No content could be extracted from file: {file_path.name}" ) - return False except Exception as e: logging.error( diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index ea45929e..09a8df3f 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -653,7 +653,7 @@ class LightRAG: if update_storage: await self._insert_done() - async def apipeline_enqueue_documents(self, input: str | list[str]) -> bool: + async def apipeline_enqueue_documents(self, input: str | list[str]): """ Pipeline for Processing Documents @@ -691,12 +691,11 @@ class LightRAG: if not new_docs: logger.info("No new unique documents were found.") - return False + return # 4. Store status document await self.doc_status.upsert(new_docs) logger.info(f"Stored {len(new_docs)} new unique documents") - return True async def apipeline_process_enqueue_documents( self,