diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..44c3aff1 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +recursive-include lightrag/api/webui * diff --git a/README.md b/README.md index 5e8c5a94..57563a1f 100644 --- a/README.md +++ b/README.md @@ -505,44 +505,58 @@ rag.query_with_separate_keyword_extraction( ```python custom_kg = { + "chunks": [ + { + "content": "Alice and Bob are collaborating on quantum computing research.", + "source_id": "doc-1" + } + ], "entities": [ { - "entity_name": "CompanyA", - "entity_type": "Organization", - "description": "A major technology company", - "source_id": "Source1" + "entity_name": "Alice", + "entity_type": "person", + "description": "Alice is a researcher specializing in quantum physics.", + "source_id": "doc-1" }, { - "entity_name": "ProductX", - "entity_type": "Product", - "description": "A popular product developed by CompanyA", - "source_id": "Source1" + "entity_name": "Bob", + "entity_type": "person", + "description": "Bob is a mathematician.", + "source_id": "doc-1" + }, + { + "entity_name": "Quantum Computing", + "entity_type": "technology", + "description": "Quantum computing utilizes quantum mechanical phenomena for computation.", + "source_id": "doc-1" } ], "relationships": [ { - "src_id": "CompanyA", - "tgt_id": "ProductX", - "description": "CompanyA develops ProductX", - "keywords": "develop, produce", + "src_id": "Alice", + "tgt_id": "Bob", + "description": "Alice and Bob are research partners.", + "keywords": "collaboration research", "weight": 1.0, - "source_id": "Source1" + "source_id": "doc-1" + }, + { + "src_id": "Alice", + "tgt_id": "Quantum Computing", + "description": "Alice conducts research on quantum computing.", + "keywords": "research expertise", + "weight": 1.0, + "source_id": "doc-1" + }, + { + "src_id": "Bob", + "tgt_id": "Quantum Computing", + "description": "Bob researches quantum computing.", + "keywords": "research application", + "weight": 1.0, + "source_id": "doc-1" } - ], - "chunks": [ - { - "content": "ProductX, developed by CompanyA, has revolutionized the market with its cutting-edge features.", - "source_id": "Source1", - }, - { - "content": "PersonA is a prominent researcher at UniversityB, focusing on artificial intelligence and machine learning.", - "source_id": "Source2", - }, - { - "content": "None", - "source_id": "UNKNOWN", - }, - ], + ] } rag.insert_custom_kg(custom_kg) @@ -655,16 +669,19 @@ setup_logger("lightrag", level="INFO") # Note: Default settings use NetworkX # Initialize LightRAG with Neo4J implementation. -rag = LightRAG( - working_dir=WORKING_DIR, - llm_model_func=gpt_4o_mini_complete, # Use gpt_4o_mini_complete LLM model - graph_storage="Neo4JStorage", #<-----------override KG default -) +async def initialize_rag(): + rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=gpt_4o_mini_complete, # Use gpt_4o_mini_complete LLM model + graph_storage="Neo4JStorage", #<-----------override KG default + ) -# Initialize database connections -await rag.initialize_storages() -# Initialize pipeline status for document processing -await initialize_pipeline_status() + # Initialize database connections + await rag.initialize_storages() + # Initialize pipeline status for document processing + await initialize_pipeline_status() + + return rag ``` see test_neo4j.py for a working example. @@ -768,7 +785,8 @@ rag.delete_by_doc_id("doc_id") LightRAG now supports comprehensive knowledge graph management capabilities, allowing you to create, edit, and delete entities and relationships within your knowledge graph. -### Create Entities and Relations +
+ Create Entities and Relations ```python # Create new entity @@ -790,8 +808,10 @@ relation = rag.create_relation("Google", "Gmail", { "weight": 2.0 }) ``` +
-### Edit Entities and Relations +
+ Edit Entities and Relations ```python # Edit an existing entity @@ -813,6 +833,7 @@ updated_relation = rag.edit_relation("Google", "Google Mail", { "weight": 3.0 }) ``` +
All operations are available in both synchronous and asynchronous versions. The asynchronous versions have the prefix "a" (e.g., `acreate_entity`, `aedit_relation`). diff --git a/examples/lightrag_azure_openai_demo.py b/examples/lightrag_azure_openai_demo.py index e0840366..c101383d 100644 --- a/examples/lightrag_azure_openai_demo.py +++ b/examples/lightrag_azure_openai_demo.py @@ -81,34 +81,46 @@ asyncio.run(test_funcs()) embedding_dimension = 3072 -rag = LightRAG( - working_dir=WORKING_DIR, - llm_model_func=llm_model_func, - embedding_func=EmbeddingFunc( - embedding_dim=embedding_dimension, - max_token_size=8192, - func=embedding_func, - ), -) -rag.initialize_storages() -initialize_pipeline_status() +async def initialize_rag(): + rag = LightRAG( + working_dir=WORKING_DIR, + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=embedding_dimension, + max_token_size=8192, + func=embedding_func, + ), + ) -book1 = open("./book_1.txt", encoding="utf-8") -book2 = open("./book_2.txt", encoding="utf-8") + await rag.initialize_storages() + await initialize_pipeline_status() -rag.insert([book1.read(), book2.read()]) + return rag -query_text = "What are the main themes?" -print("Result (Naive):") -print(rag.query(query_text, param=QueryParam(mode="naive"))) +def main(): + rag = asyncio.run(initialize_rag()) -print("\nResult (Local):") -print(rag.query(query_text, param=QueryParam(mode="local"))) + book1 = open("./book_1.txt", encoding="utf-8") + book2 = open("./book_2.txt", encoding="utf-8") -print("\nResult (Global):") -print(rag.query(query_text, param=QueryParam(mode="global"))) + rag.insert([book1.read(), book2.read()]) -print("\nResult (Hybrid):") -print(rag.query(query_text, param=QueryParam(mode="hybrid"))) + query_text = "What are the main themes?" + + print("Result (Naive):") + print(rag.query(query_text, param=QueryParam(mode="naive"))) + + print("\nResult (Local):") + print(rag.query(query_text, param=QueryParam(mode="local"))) + + print("\nResult (Global):") + print(rag.query(query_text, param=QueryParam(mode="global"))) + + print("\nResult (Hybrid):") + print(rag.query(query_text, param=QueryParam(mode="hybrid"))) + + +if __name__ == "__main__": + main() diff --git a/examples/lightrag_bedrock_demo.py b/examples/lightrag_bedrock_demo.py index 68e9f962..c7f41677 100644 --- a/examples/lightrag_bedrock_demo.py +++ b/examples/lightrag_bedrock_demo.py @@ -53,3 +53,7 @@ def main(): "What are the top themes in this story?", param=QueryParam(mode=mode) ) ) + + +if __name__ == "__main__": + main() diff --git a/examples/lightrag_nvidia_demo.py b/examples/lightrag_nvidia_demo.py index 6de0814c..0e9259bc 100644 --- a/examples/lightrag_nvidia_demo.py +++ b/examples/lightrag_nvidia_demo.py @@ -125,7 +125,7 @@ async def initialize_rag(): async def main(): try: # Initialize RAG instance - rag = asyncio.run(initialize_rag()) + rag = await initialize_rag() # reading file with open("./book.txt", "r", encoding="utf-8") as f: diff --git a/examples/lightrag_openai_compatible_demo.py b/examples/lightrag_openai_compatible_demo.py index 1c4a7a92..d26a8de3 100644 --- a/examples/lightrag_openai_compatible_demo.py +++ b/examples/lightrag_openai_compatible_demo.py @@ -77,7 +77,7 @@ async def initialize_rag(): async def main(): try: # Initialize RAG instance - rag = asyncio.run(initialize_rag()) + rag = await initialize_rag() with open("./book.txt", "r", encoding="utf-8") as f: await rag.ainsert(f.read()) diff --git a/examples/lightrag_openai_compatible_demo_embedding_cache.py b/examples/lightrag_openai_compatible_demo_embedding_cache.py index 85408f3b..4638219f 100644 --- a/examples/lightrag_openai_compatible_demo_embedding_cache.py +++ b/examples/lightrag_openai_compatible_demo_embedding_cache.py @@ -81,7 +81,7 @@ async def initialize_rag(): async def main(): try: # Initialize RAG instance - rag = asyncio.run(initialize_rag()) + rag = await initialize_rag() with open("./book.txt", "r", encoding="utf-8") as f: await rag.ainsert(f.read()) diff --git a/examples/lightrag_oracle_demo.py b/examples/lightrag_oracle_demo.py index 420f1af0..6663f6a1 100644 --- a/examples/lightrag_oracle_demo.py +++ b/examples/lightrag_oracle_demo.py @@ -107,7 +107,7 @@ async def initialize_rag(): async def main(): try: # Initialize RAG instance - rag = asyncio.run(initialize_rag()) + rag = await initialize_rag() # Extract and Insert into LightRAG storage with open(WORKING_DIR + "/docs.txt", "r", encoding="utf-8") as f: diff --git a/examples/lightrag_tidb_demo.py b/examples/lightrag_tidb_demo.py index f167e9cc..52695560 100644 --- a/examples/lightrag_tidb_demo.py +++ b/examples/lightrag_tidb_demo.py @@ -87,7 +87,7 @@ async def initialize_rag(): async def main(): try: # Initialize RAG instance - rag = asyncio.run(initialize_rag()) + rag = await initialize_rag() with open("./book.txt", "r", encoding="utf-8") as f: rag.insert(f.read()) diff --git a/examples/lightrag_zhipu_postgres_demo.py b/examples/lightrag_zhipu_postgres_demo.py index 304c5f2c..e4a20f26 100644 --- a/examples/lightrag_zhipu_postgres_demo.py +++ b/examples/lightrag_zhipu_postgres_demo.py @@ -59,7 +59,7 @@ async def initialize_rag(): async def main(): # Initialize RAG instance - rag = asyncio.run(initialize_rag()) + rag = await initialize_rag() # add embedding_func for graph database, it's deleted in commit 5661d76860436f7bf5aef2e50d9ee4a59660146c rag.chunk_entity_relation_graph.embedding_func = rag.embedding_func diff --git a/examples/query_keyword_separation_example.py b/examples/query_keyword_separation_example.py index cbfdd930..092330f4 100644 --- a/examples/query_keyword_separation_example.py +++ b/examples/query_keyword_separation_example.py @@ -102,7 +102,7 @@ async def initialize_rag(): # Example function demonstrating the new query_with_separate_keyword_extraction usage async def run_example(): # Initialize RAG instance - rag = asyncio.run(initialize_rag()) + rag = await initialize_rag() book1 = open("./book_1.txt", encoding="utf-8") book2 = open("./book_2.txt", encoding="utf-8") diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index c91f693f..eddeaa5c 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -6,7 +6,6 @@ from fastapi import ( FastAPI, Depends, ) -from fastapi.responses import FileResponse import asyncio import os import logging @@ -408,10 +407,6 @@ def create_app(args): name="webui", ) - @app.get("/webui/") - async def webui_root(): - return FileResponse(static_dir / "index.html") - return app diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index ab5aff96..d9dfe913 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -215,9 +215,29 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: | ".scss" | ".less" ): - content = file.decode("utf-8") + try: + # Try to decode as UTF-8 + content = file.decode("utf-8") + + # Validate content + if not content or len(content.strip()) == 0: + logger.error(f"Empty content in file: {file_path.name}") + return False + + # Check if content looks like binary data string representation + if content.startswith("b'") or content.startswith('b"'): + logger.error( + f"File {file_path.name} appears to contain binary data representation instead of text" + ) + return False + + except UnicodeDecodeError: + logger.error( + f"File {file_path.name} is not valid UTF-8 encoded text. Please convert it to UTF-8 before processing." + ) + return False case ".pdf": - if not pm.is_installed("pypdf2"): + if not pm.is_installed("pypdf2"): # type: ignore pm.install("pypdf2") from PyPDF2 import PdfReader # type: ignore from io import BytesIO @@ -227,18 +247,18 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: for page in reader.pages: content += page.extract_text() + "\n" case ".docx": - if not pm.is_installed("docx"): + if not pm.is_installed("python-docx"): # type: ignore pm.install("docx") - from docx import Document + from docx import Document # type: ignore from io import BytesIO docx_file = BytesIO(file) doc = Document(docx_file) content = "\n".join([paragraph.text for paragraph in doc.paragraphs]) case ".pptx": - if not pm.is_installed("pptx"): + if not pm.is_installed("python-pptx"): # type: ignore pm.install("pptx") - from pptx import Presentation + from pptx import Presentation # type: ignore from io import BytesIO pptx_file = BytesIO(file) @@ -248,9 +268,9 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: if hasattr(shape, "text"): content += shape.text + "\n" case ".xlsx": - if not pm.is_installed("openpyxl"): + if not pm.is_installed("openpyxl"): # type: ignore pm.install("openpyxl") - from openpyxl import load_workbook + from openpyxl import load_workbook # type: ignore from io import BytesIO xlsx_file = BytesIO(file) diff --git a/lightrag/kg/json_kv_impl.py b/lightrag/kg/json_kv_impl.py index 8d707899..c0b61a63 100644 --- a/lightrag/kg/json_kv_impl.py +++ b/lightrag/kg/json_kv_impl.py @@ -44,6 +44,15 @@ class JsonKVStorage(BaseKVStorage): ) write_json(data_dict, self._file_name) + async def get_all(self) -> dict[str, Any]: + """Get all data from storage + + Returns: + Dictionary containing all stored data + """ + async with self._storage_lock: + return dict(self._data) + async def get_by_id(self, id: str) -> dict[str, Any] | None: async with self._storage_lock: return self._data.get(id) diff --git a/lightrag/kg/tidb_impl.py b/lightrag/kg/tidb_impl.py index 4adb0141..51d1c365 100644 --- a/lightrag/kg/tidb_impl.py +++ b/lightrag/kg/tidb_impl.py @@ -174,6 +174,14 @@ class TiDBKVStorage(BaseKVStorage): self.db = None ################ QUERY METHODS ################ + async def get_all(self) -> dict[str, Any]: + """Get all data from storage + + Returns: + Dictionary containing all stored data + """ + async with self._storage_lock: + return dict(self._data) async def get_by_id(self, id: str) -> dict[str, Any] | None: """Fetch doc_full data by id.""" diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 27fdafda..a36934e8 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -689,8 +689,24 @@ class LightRAG: all_new_doc_ids = set(new_docs.keys()) # Exclude IDs of documents that are already in progress unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids) + + # Log ignored document IDs + ignored_ids = [ + doc_id for doc_id in unique_new_doc_ids if doc_id not in new_docs + ] + if ignored_ids: + logger.warning( + f"Ignoring {len(ignored_ids)} document IDs not found in new_docs" + ) + for doc_id in ignored_ids: + logger.warning(f"Ignored document ID: {doc_id}") + # Filter new_docs to only include documents with unique IDs - new_docs = {doc_id: new_docs[doc_id] for doc_id in unique_new_doc_ids} + new_docs = { + doc_id: new_docs[doc_id] + for doc_id in unique_new_doc_ids + if doc_id in new_docs + } if not new_docs: logger.info("No new unique documents were found.") @@ -1435,14 +1451,22 @@ class LightRAG: logger.debug(f"Starting deletion for document {doc_id}") - doc_to_chunk_id = doc_id.replace("doc", "chunk") + # 2. Get all chunks related to this document + # Find all chunks where full_doc_id equals the current doc_id + all_chunks = await self.text_chunks.get_all() + related_chunks = { + chunk_id: chunk_data + for chunk_id, chunk_data in all_chunks.items() + if isinstance(chunk_data, dict) + and chunk_data.get("full_doc_id") == doc_id + } - # 2. Get all related chunks - chunks = await self.text_chunks.get_by_id(doc_to_chunk_id) - if not chunks: + if not related_chunks: + logger.warning(f"No chunks found for document {doc_id}") return - chunk_ids = {chunks["full_doc_id"].replace("doc", "chunk")} + # Get all related chunk IDs + chunk_ids = set(related_chunks.keys()) logger.debug(f"Found {len(chunk_ids)} chunks to delete") # 3. Before deleting, check the related entities and relationships for these chunks @@ -1630,9 +1654,18 @@ class LightRAG: logger.warning(f"Document {doc_id} still exists in full_docs") # Verify if chunks have been deleted - remaining_chunks = await self.text_chunks.get_by_id(doc_to_chunk_id) - if remaining_chunks: - logger.warning(f"Found {len(remaining_chunks)} remaining chunks") + all_remaining_chunks = await self.text_chunks.get_all() + remaining_related_chunks = { + chunk_id: chunk_data + for chunk_id, chunk_data in all_remaining_chunks.items() + if isinstance(chunk_data, dict) + and chunk_data.get("full_doc_id") == doc_id + } + + if remaining_related_chunks: + logger.warning( + f"Found {len(remaining_related_chunks)} remaining chunks" + ) # Verify entities and relationships for chunk_id in chunk_ids: