From 9c45824e78754e06f392053aac2460b1988787a1 Mon Sep 17 00:00:00 2001
From: yangdx <yangdx@znipower.com>
Date: Tue, 18 Feb 2025 20:13:09 +0800
Subject: [PATCH 1/5] Optimize file handling for DOCX and PPTX processing.

- Removed redundant file content reading.
- Directly passed file to BytesIO.
- Simplified DOCX content extraction.
- Streamlined PPTX slide processing.
- Reduced memory usage in file handling.
---
 lightrag/api/lightrag_server.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py
index fba81086..f9420153 100644
--- a/lightrag/api/lightrag_server.py
+++ b/lightrag/api/lightrag_server.py
@@ -1343,8 +1343,7 @@ def create_app(args):
                     from docx import Document
                     from io import BytesIO
 
-                    docx_content = await file.read()
-                    docx_file = BytesIO(docx_content)
+                    docx_file = BytesIO(file)
                     doc = Document(docx_file)
                     content = "\n".join(
                         [paragraph.text for paragraph in doc.paragraphs]
@@ -1355,8 +1354,7 @@ def create_app(args):
                     from pptx import Presentation  # type: ignore
                     from io import BytesIO
 
-                    pptx_content = await file.read()
-                    pptx_file = BytesIO(pptx_content)
+                    pptx_file = BytesIO(file)
                     prs = Presentation(pptx_file)
                     for slide in prs.slides:
                         for shape in slide.shapes:

From cac93424d90fd38019c9dfb9957fa4b170ff8355 Mon Sep 17 00:00:00 2001
From: yangdx <yangdx@znipower.com>
Date: Tue, 18 Feb 2025 20:17:42 +0800
Subject: [PATCH 2/5] Added support for reading .xlsx files in LightRAG.

- Install openpyxl if not present
- Load .xlsx file using openpyxl
- Extract sheet titles and content
- Format rows with tab-separated values
- Append sheet content to overall text
---
 lightrag/api/lightrag_server.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py
index f9420153..36403241 100644
--- a/lightrag/api/lightrag_server.py
+++ b/lightrag/api/lightrag_server.py
@@ -1360,6 +1360,19 @@ def create_app(args):
                         for shape in slide.shapes:
                             if hasattr(shape, "text"):
                                 content += shape.text + "\n"
+                case ".xlsx":
+                    if not pm.is_installed("openpyxl"):
+                        pm.install("openpyxl")
+                    from openpyxl import load_workbook
+                    from io import BytesIO
+
+                    xlsx_file = BytesIO(file)
+                    wb = load_workbook(xlsx_file)
+                    for sheet in wb:
+                        content += f"Sheet: {sheet.title}\n"
+                        for row in sheet.iter_rows(values_only=True):
+                            content += "\t".join(str(cell) if cell is not None else "" for cell in row) + "\n"
+                        content += "\n"
                 case _:
                     logging.error(
                         f"Unsupported file type: {file_path.name} (extension {ext})"

From d34dbc5717dc20fda2d77e2b348cbd06767fe3bf Mon Sep 17 00:00:00 2001
From: yangdx <yangdx@znipower.com>
Date: Tue, 18 Feb 2025 23:52:53 +0800
Subject: [PATCH 3/5] Improved document enqueue logic with existence checks.

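- Added a boolean return status to `apipeline_enqueue_documents`
  (True when new documents were stored, False when none were new)
- Enhanced logging for duplicate documents
- Returned False when no content could be extracted from a file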
---
 lightrag/api/lightrag_server.py | 14 +++++++-------
 lightrag/lightrag.py            |  5 +++--
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py
index 36403241..699a961f 100644
--- a/lightrag/api/lightrag_server.py
+++ b/lightrag/api/lightrag_server.py
@@ -1381,15 +1381,15 @@ def create_app(args):
 
             # Insert into the RAG queue
             if content:
-                await rag.apipeline_enqueue_documents(content)
-                logging.info(
-                    f"Successfully processed and enqueued file: {file_path.name}"
-                )
+                has_new_docs = await rag.apipeline_enqueue_documents(content)
+                if has_new_docs:
+                    logging.info(f"Successfully processed and enqueued file: {file_path.name}")
+                else:
+                    logging.info(f"File content already exists, skipping: {file_path.name}")
                 return True
             else:
-                logging.error(
-                    f"No content could be extracted from file: {file_path.name}"
-                )
+                logging.error(f"No content could be extracted from file: {file_path.name}")
+                return False
 
         except Exception as e:
             logging.error(
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 09a8df3f..ea45929e 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -653,7 +653,7 @@ class LightRAG:
             if update_storage:
                 await self._insert_done()
 
-    async def apipeline_enqueue_documents(self, input: str | list[str]):
+    async def apipeline_enqueue_documents(self, input: str | list[str]) -> bool:
         """
         Pipeline for Processing Documents
 
@@ -691,11 +691,12 @@ class LightRAG:
 
         if not new_docs:
             logger.info("No new unique documents were found.")
-            return
+            return False
 
         # 4. Store status document
         await self.doc_status.upsert(new_docs)
         logger.info(f"Stored {len(new_docs)} new unique documents")
+        return True
 
     async def apipeline_process_enqueue_documents(
         self,

From 8196df83f85a38a6790207af5236cf6f63b7eaba Mon Sep 17 00:00:00 2001
From: yangdx <yangdx@znipower.com>
Date: Wed, 19 Feb 2025 01:39:25 +0800
Subject: [PATCH 4/5] Fix linting

---
 lightrag/api/lightrag_server.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py
index 699a961f..5f24ceba 100644
--- a/lightrag/api/lightrag_server.py
+++ b/lightrag/api/lightrag_server.py
@@ -1363,7 +1363,7 @@ def create_app(args):
                 case ".xlsx":
                     if not pm.is_installed("openpyxl"):
                         pm.install("openpyxl")
-                    from openpyxl import load_workbook
+                    from openpyxl import load_workbook  # type: ignore
                     from io import BytesIO
 
                     xlsx_file = BytesIO(file)
@@ -1371,7 +1371,13 @@ def create_app(args):
                     for sheet in wb:
                         content += f"Sheet: {sheet.title}\n"
                         for row in sheet.iter_rows(values_only=True):
-                            content += "\t".join(str(cell) if cell is not None else "" for cell in row) + "\n"
+                            content += (
+                                "\t".join(
+                                    str(cell) if cell is not None else ""
+                                    for cell in row
+                                )
+                                + "\n"
+                            )
                         content += "\n"
                 case _:
                     logging.error(
@@ -1383,12 +1389,18 @@ def create_app(args):
             if content:
                 has_new_docs = await rag.apipeline_enqueue_documents(content)
                 if has_new_docs:
-                    logging.info(f"Successfully processed and enqueued file: {file_path.name}")
+                    logging.info(
+                        f"Successfully processed and enqueued file: {file_path.name}"
+                    )
                 else:
-                    logging.info(f"File content already exists, skipping: {file_path.name}")
+                    logging.info(
+                        f"File content already exists, skipping: {file_path.name}"
+                    )
                 return True
             else:
-                logging.error(f"No content could be extracted from file: {file_path.name}")
+                logging.error(
+                    f"No content could be extracted from file: {file_path.name}"
+                )
                 return False
 
         except Exception as e:

From 32bfcbb33285cbe8cb00dfe76aea5c01b837a13e Mon Sep 17 00:00:00 2001
From: yangdx <yangdx@znipower.com>
Date: Wed, 19 Feb 2025 13:39:50 +0800
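Subject: [PATCH 5/5] Revert "Improved document enqueue logic with existence
 checks"

Note: this is not a byte-exact revert. The restored success log message
reads "Successfully fetched and enqueued file" rather than the earlier
"Successfully processed and enqueued file".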
---
 lightrag/api/lightrag_server.py | 14 ++++----------
 lightrag/lightrag.py            |  5 ++---
 2 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py
index 5f24ceba..81437505 100644
--- a/lightrag/api/lightrag_server.py
+++ b/lightrag/api/lightrag_server.py
@@ -1387,21 +1387,15 @@ def create_app(args):
 
             # Insert into the RAG queue
             if content:
-                has_new_docs = await rag.apipeline_enqueue_documents(content)
-                if has_new_docs:
-                    logging.info(
-                        f"Successfully processed and enqueued file: {file_path.name}"
-                    )
-                else:
-                    logging.info(
-                        f"File content already exists, skipping: {file_path.name}"
-                    )
+                await rag.apipeline_enqueue_documents(content)
+                logging.info(
+                    f"Successfully fetched and enqueued file: {file_path.name}"
+                )
                 return True
             else:
                 logging.error(
                     f"No content could be extracted from file: {file_path.name}"
                 )
-                return False
 
         except Exception as e:
             logging.error(
diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index ea45929e..09a8df3f 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -653,7 +653,7 @@ class LightRAG:
             if update_storage:
                 await self._insert_done()
 
-    async def apipeline_enqueue_documents(self, input: str | list[str]) -> bool:
+    async def apipeline_enqueue_documents(self, input: str | list[str]):
         """
         Pipeline for Processing Documents
 
@@ -691,12 +691,11 @@ class LightRAG:
 
         if not new_docs:
             logger.info("No new unique documents were found.")
-            return False
+            return
 
         # 4. Store status document
         await self.doc_status.upsert(new_docs)
         logger.info(f"Stored {len(new_docs)} new unique documents")
-        return True
 
     async def apipeline_process_enqueue_documents(
         self,