Merge branch 'main' into Add-summary-lang-support

yangdx
2025-03-04 14:02:21 +08:00
16 changed files with 194 additions and 91 deletions

MANIFEST.in (new file, 1 addition)

@@ -0,0 +1 @@
+recursive-include lightrag/api/webui *


@@ -505,44 +505,58 @@ rag.query_with_separate_keyword_extraction(
 ```python
 custom_kg = {
+    "chunks": [
+        {
+            "content": "Alice and Bob are collaborating on quantum computing research.",
+            "source_id": "doc-1"
+        }
+    ],
     "entities": [
         {
-            "entity_name": "CompanyA",
-            "entity_type": "Organization",
-            "description": "A major technology company",
-            "source_id": "Source1"
+            "entity_name": "Alice",
+            "entity_type": "person",
+            "description": "Alice is a researcher specializing in quantum physics.",
+            "source_id": "doc-1"
         },
         {
-            "entity_name": "ProductX",
-            "entity_type": "Product",
-            "description": "A popular product developed by CompanyA",
-            "source_id": "Source1"
+            "entity_name": "Bob",
+            "entity_type": "person",
+            "description": "Bob is a mathematician.",
+            "source_id": "doc-1"
+        },
+        {
+            "entity_name": "Quantum Computing",
+            "entity_type": "technology",
+            "description": "Quantum computing utilizes quantum mechanical phenomena for computation.",
+            "source_id": "doc-1"
         }
     ],
     "relationships": [
         {
-            "src_id": "CompanyA",
-            "tgt_id": "ProductX",
-            "description": "CompanyA develops ProductX",
-            "keywords": "develop, produce",
+            "src_id": "Alice",
+            "tgt_id": "Bob",
+            "description": "Alice and Bob are research partners.",
+            "keywords": "collaboration research",
             "weight": 1.0,
-            "source_id": "Source1"
+            "source_id": "doc-1"
+        },
+        {
+            "src_id": "Alice",
+            "tgt_id": "Quantum Computing",
+            "description": "Alice conducts research on quantum computing.",
+            "keywords": "research expertise",
+            "weight": 1.0,
+            "source_id": "doc-1"
+        },
+        {
+            "src_id": "Bob",
+            "tgt_id": "Quantum Computing",
+            "description": "Bob researches quantum computing.",
+            "keywords": "research application",
+            "weight": 1.0,
+            "source_id": "doc-1"
         }
-    ],
-    "chunks": [
-        {
-            "content": "ProductX, developed by CompanyA, has revolutionized the market with its cutting-edge features.",
-            "source_id": "Source1",
-        },
-        {
-            "content": "PersonA is a prominent researcher at UniversityB, focusing on artificial intelligence and machine learning.",
-            "source_id": "Source2",
-        },
-        {
-            "content": "None",
-            "source_id": "UNKNOWN",
-        },
-    ],
+    ]
 }
 rag.insert_custom_kg(custom_kg)
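
The reworked sample keeps every `source_id` consistent across chunks, entities, and relationships (`doc-1`), which is what lets LightRAG tie the three lists back to the same provenance. Once inserted, the custom graph participates in retrieval like any extracted one; a minimal sketch, assuming the standard `QueryParam` API shown elsewhere in the README:

```python
from lightrag import QueryParam

# Entities and relations supplied via insert_custom_kg are queryable
# exactly like those extracted from inserted documents.
print(rag.query("How do Alice and Bob collaborate?", param=QueryParam(mode="hybrid")))
```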
@@ -655,16 +669,19 @@ setup_logger("lightrag", level="INFO")
 # Note: Default settings use NetworkX
 # Initialize LightRAG with Neo4J implementation.
-rag = LightRAG(
-    working_dir=WORKING_DIR,
-    llm_model_func=gpt_4o_mini_complete,  # Use gpt_4o_mini_complete LLM model
-    graph_storage="Neo4JStorage", #<-----------override KG default
-)
+async def initialize_rag():
+    rag = LightRAG(
+        working_dir=WORKING_DIR,
+        llm_model_func=gpt_4o_mini_complete,  # Use gpt_4o_mini_complete LLM model
+        graph_storage="Neo4JStorage", #<-----------override KG default
+    )
 
-# Initialize database connections
-await rag.initialize_storages()
-# Initialize pipeline status for document processing
-await initialize_pipeline_status()
+    # Initialize database connections
+    await rag.initialize_storages()
+    # Initialize pipeline status for document processing
+    await initialize_pipeline_status()
+
+    return rag
 ```
 
 see test_neo4j.py for a working example.
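
A hedged sketch of how the async factory above is meant to be driven from synchronous code, mirroring the `main()` wrappers this commit adds to the demo scripts (the `QueryParam` import and query text are assumptions):

```python
import asyncio

from lightrag import QueryParam

def main():
    # Build the Neo4J-backed instance via the async factory, then query it.
    rag = asyncio.run(initialize_rag())
    print(rag.query("What are the main themes?", param=QueryParam(mode="hybrid")))

if __name__ == "__main__":
    main()
```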
@@ -768,7 +785,8 @@ rag.delete_by_doc_id("doc_id")
 LightRAG now supports comprehensive knowledge graph management capabilities, allowing you to create, edit, and delete entities and relationships within your knowledge graph.
 
-### Create Entities and Relations
+<details>
+  <summary> <b>Create Entities and Relations</b> </summary>
 
 ```python
 # Create new entity
@@ -790,8 +808,10 @@ relation = rag.create_relation("Google", "Gmail", {
"weight": 2.0 "weight": 2.0
}) })
``` ```
</details>
### Edit Entities and Relations <details>
<summary> <b>Edit Entities and Relations</b> </summary>
```python ```python
# Edit an existing entity # Edit an existing entity
@@ -813,6 +833,7 @@ updated_relation = rag.edit_relation("Google", "Google Mail", {
"weight": 3.0 "weight": 3.0
}) })
``` ```
</details>
All operations are available in both synchronous and asynchronous versions. The asynchronous versions have the prefix "a" (e.g., `acreate_entity`, `aedit_relation`). All operations are available in both synchronous and asynchronous versions. The asynchronous versions have the prefix "a" (e.g., `acreate_entity`, `aedit_relation`).
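
For example, the edits above can be rewritten against the async variants; a sketch assuming the same `rag` instance inside an async context (the argument dictionaries follow the synchronous examples):

```python
# Async counterparts of the calls shown above: same arguments, awaited.
entity = await rag.acreate_entity("Google", {
    "description": "Google is a multinational technology company",
    "entity_type": "company"
})

relation = await rag.aedit_relation("Google", "Google Mail", {
    "description": "Google created and maintains Gmail",
    "keywords": "create maintain service",
    "weight": 3.0
})
```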


@@ -81,34 +81,46 @@ asyncio.run(test_funcs())
 embedding_dimension = 3072
 
-rag = LightRAG(
-    working_dir=WORKING_DIR,
-    llm_model_func=llm_model_func,
-    embedding_func=EmbeddingFunc(
-        embedding_dim=embedding_dimension,
-        max_token_size=8192,
-        func=embedding_func,
-    ),
-)
-
-rag.initialize_storages()
-initialize_pipeline_status()
-
-book1 = open("./book_1.txt", encoding="utf-8")
-book2 = open("./book_2.txt", encoding="utf-8")
-rag.insert([book1.read(), book2.read()])
-
-query_text = "What are the main themes?"
-
-print("Result (Naive):")
-print(rag.query(query_text, param=QueryParam(mode="naive")))
-print("\nResult (Local):")
-print(rag.query(query_text, param=QueryParam(mode="local")))
-print("\nResult (Global):")
-print(rag.query(query_text, param=QueryParam(mode="global")))
-print("\nResult (Hybrid):")
-print(rag.query(query_text, param=QueryParam(mode="hybrid")))
+async def initialize_rag():
+    rag = LightRAG(
+        working_dir=WORKING_DIR,
+        llm_model_func=llm_model_func,
+        embedding_func=EmbeddingFunc(
+            embedding_dim=embedding_dimension,
+            max_token_size=8192,
+            func=embedding_func,
+        ),
+    )
+
+    await rag.initialize_storages()
+    await initialize_pipeline_status()
+
+    return rag
+
+def main():
+    rag = asyncio.run(initialize_rag())
+    book1 = open("./book_1.txt", encoding="utf-8")
+    book2 = open("./book_2.txt", encoding="utf-8")
+    rag.insert([book1.read(), book2.read()])
+
+    query_text = "What are the main themes?"
+
+    print("Result (Naive):")
+    print(rag.query(query_text, param=QueryParam(mode="naive")))
+    print("\nResult (Local):")
+    print(rag.query(query_text, param=QueryParam(mode="local")))
+    print("\nResult (Global):")
+    print(rag.query(query_text, param=QueryParam(mode="global")))
+    print("\nResult (Hybrid):")
+    print(rag.query(query_text, param=QueryParam(mode="hybrid")))
+
+if __name__ == "__main__":
+    main()


@@ -53,3 +53,7 @@ def main():
"What are the top themes in this story?", param=QueryParam(mode=mode) "What are the top themes in this story?", param=QueryParam(mode=mode)
) )
) )
if __name__ == "__main__":
main()


@@ -125,7 +125,7 @@ async def initialize_rag():
 async def main():
     try:
         # Initialize RAG instance
-        rag = asyncio.run(initialize_rag())
+        rag = await initialize_rag()
 
         # reading file
         with open("./book.txt", "r", encoding="utf-8") as f:


@@ -77,7 +77,7 @@ async def initialize_rag():
 async def main():
     try:
         # Initialize RAG instance
-        rag = asyncio.run(initialize_rag())
+        rag = await initialize_rag()
 
         with open("./book.txt", "r", encoding="utf-8") as f:
             await rag.ainsert(f.read())


@@ -81,7 +81,7 @@ async def initialize_rag():
 async def main():
     try:
         # Initialize RAG instance
-        rag = asyncio.run(initialize_rag())
+        rag = await initialize_rag()
 
         with open("./book.txt", "r", encoding="utf-8") as f:
             await rag.ainsert(f.read())


@@ -107,7 +107,7 @@ async def initialize_rag():
 async def main():
     try:
         # Initialize RAG instance
-        rag = asyncio.run(initialize_rag())
+        rag = await initialize_rag()
 
         # Extract and Insert into LightRAG storage
         with open(WORKING_DIR + "/docs.txt", "r", encoding="utf-8") as f:


@@ -87,7 +87,7 @@ async def initialize_rag():
 async def main():
     try:
         # Initialize RAG instance
-        rag = asyncio.run(initialize_rag())
+        rag = await initialize_rag()
 
         with open("./book.txt", "r", encoding="utf-8") as f:
             rag.insert(f.read())


@@ -59,7 +59,7 @@ async def initialize_rag():
 async def main():
     # Initialize RAG instance
-    rag = asyncio.run(initialize_rag())
+    rag = await initialize_rag()
 
     # add embedding_func for graph database, it's deleted in commit 5661d76860436f7bf5aef2e50d9ee4a59660146c
     rag.chunk_entity_relation_graph.embedding_func = rag.embedding_func


@@ -102,7 +102,7 @@ async def initialize_rag():
 # Example function demonstrating the new query_with_separate_keyword_extraction usage
 async def run_example():
     # Initialize RAG instance
-    rag = asyncio.run(initialize_rag())
+    rag = await initialize_rag()
 
     book1 = open("./book_1.txt", encoding="utf-8")
     book2 = open("./book_2.txt", encoding="utf-8")


@@ -6,7 +6,6 @@ from fastapi import (
     FastAPI,
     Depends,
 )
-from fastapi.responses import FileResponse
 import asyncio
 import os
 import logging
@@ -408,10 +407,6 @@ def create_app(args):
name="webui", name="webui",
) )
@app.get("/webui/")
async def webui_root():
return FileResponse(static_dir / "index.html")
return app return app
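
Dropping the explicit `/webui/` route only works if the static mount registered under `name="webui"` serves `index.html` on its own; FastAPI's `StaticFiles` does exactly that when constructed with `html=True`. A sketch of the assumed arrangement (directory path hypothetical):

```python
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles

app = FastAPI()

# With html=True, GET /webui/ resolves to webui/index.html automatically,
# so no dedicated FileResponse route is required.
app.mount("/webui", StaticFiles(directory="lightrag/api/webui", html=True), name="webui")
```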


@@ -215,9 +215,29 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
| ".scss" | ".scss"
| ".less" | ".less"
): ):
content = file.decode("utf-8") try:
# Try to decode as UTF-8
content = file.decode("utf-8")
# Validate content
if not content or len(content.strip()) == 0:
logger.error(f"Empty content in file: {file_path.name}")
return False
# Check if content looks like binary data string representation
if content.startswith("b'") or content.startswith('b"'):
logger.error(
f"File {file_path.name} appears to contain binary data representation instead of text"
)
return False
except UnicodeDecodeError:
logger.error(
f"File {file_path.name} is not valid UTF-8 encoded text. Please convert it to UTF-8 before processing."
)
return False
case ".pdf": case ".pdf":
if not pm.is_installed("pypdf2"): if not pm.is_installed("pypdf2"): # type: ignore
pm.install("pypdf2") pm.install("pypdf2")
from PyPDF2 import PdfReader # type: ignore from PyPDF2 import PdfReader # type: ignore
from io import BytesIO from io import BytesIO
@@ -227,18 +247,18 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
             for page in reader.pages:
                 content += page.extract_text() + "\n"
         case ".docx":
-            if not pm.is_installed("docx"):
+            if not pm.is_installed("python-docx"):  # type: ignore
                 pm.install("docx")
-            from docx import Document
+            from docx import Document  # type: ignore
             from io import BytesIO
 
             docx_file = BytesIO(file)
             doc = Document(docx_file)
             content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
         case ".pptx":
-            if not pm.is_installed("pptx"):
+            if not pm.is_installed("python-pptx"):  # type: ignore
                 pm.install("pptx")
-            from pptx import Presentation
+            from pptx import Presentation  # type: ignore
             from io import BytesIO
 
             pptx_file = BytesIO(file)
@@ -248,9 +268,9 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
if hasattr(shape, "text"): if hasattr(shape, "text"):
content += shape.text + "\n" content += shape.text + "\n"
case ".xlsx": case ".xlsx":
if not pm.is_installed("openpyxl"): if not pm.is_installed("openpyxl"): # type: ignore
pm.install("openpyxl") pm.install("openpyxl")
from openpyxl import load_workbook from openpyxl import load_workbook # type: ignore
from io import BytesIO from io import BytesIO
xlsx_file = BytesIO(file) xlsx_file = BytesIO(file)
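
The UTF-8 guard added at the top of this hunk is self-contained enough to lift out and test in isolation; a sketch of the same checks as a standalone helper (the function name and logger setup are hypothetical):

```python
import logging

logger = logging.getLogger(__name__)

def decode_text_file(file: bytes, name: str) -> str | None:
    """Mirror of the validation above: UTF-8 decode, emptiness check,
    and rejection of str(b"...") artifacts masquerading as text."""
    try:
        content = file.decode("utf-8")
    except UnicodeDecodeError:
        logger.error(f"File {name} is not valid UTF-8 encoded text.")
        return None
    if not content.strip():
        logger.error(f"Empty content in file: {name}")
        return None
    if content.startswith("b'") or content.startswith('b"'):
        logger.error(f"File {name} appears to contain a binary data representation instead of text")
        return None
    return content
```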


@@ -44,6 +44,15 @@ class JsonKVStorage(BaseKVStorage):
             )
             write_json(data_dict, self._file_name)
 
+    async def get_all(self) -> dict[str, Any]:
+        """Get all data from storage
+
+        Returns:
+            Dictionary containing all stored data
+        """
+        async with self._storage_lock:
+            return dict(self._data)
+
     async def get_by_id(self, id: str) -> dict[str, Any] | None:
         async with self._storage_lock:
             return self._data.get(id)
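
This `get_all()` is what the rewritten deletion logic in `lightrag.py` (further down in this commit) relies on: document-scoped chunk lookups no longer require deriving chunk IDs from doc IDs. A sketch of the consumption pattern, assuming `rag.text_chunks` is backed by this storage:

```python
# Scan all stored chunks and keep those that belong to one document.
all_chunks = await rag.text_chunks.get_all()
related_chunks = {
    chunk_id: chunk
    for chunk_id, chunk in all_chunks.items()
    if isinstance(chunk, dict) and chunk.get("full_doc_id") == doc_id
}
```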


@@ -174,6 +174,14 @@ class TiDBKVStorage(BaseKVStorage):
         self.db = None
 
     ################ QUERY METHODS ################
 
+    async def get_all(self) -> dict[str, Any]:
+        """Get all data from storage
+
+        Returns:
+            Dictionary containing all stored data
+        """
+        async with self._storage_lock:
+            return dict(self._data)
+
     async def get_by_id(self, id: str) -> dict[str, Any] | None:
         """Fetch doc_full data by id."""


@@ -689,8 +689,24 @@ class LightRAG:
         all_new_doc_ids = set(new_docs.keys())
         # Exclude IDs of documents that are already in progress
         unique_new_doc_ids = await self.doc_status.filter_keys(all_new_doc_ids)
 
+        # Log ignored document IDs
+        ignored_ids = [
+            doc_id for doc_id in unique_new_doc_ids if doc_id not in new_docs
+        ]
+        if ignored_ids:
+            logger.warning(
+                f"Ignoring {len(ignored_ids)} document IDs not found in new_docs"
+            )
+            for doc_id in ignored_ids:
+                logger.warning(f"Ignored document ID: {doc_id}")
+
         # Filter new_docs to only include documents with unique IDs
-        new_docs = {doc_id: new_docs[doc_id] for doc_id in unique_new_doc_ids}
+        new_docs = {
+            doc_id: new_docs[doc_id]
+            for doc_id in unique_new_doc_ids
+            if doc_id in new_docs
+        }
 
         if not new_docs:
             logger.info("No new unique documents were found.")
@@ -1435,14 +1451,22 @@ class LightRAG:
logger.debug(f"Starting deletion for document {doc_id}") logger.debug(f"Starting deletion for document {doc_id}")
doc_to_chunk_id = doc_id.replace("doc", "chunk") # 2. Get all chunks related to this document
# Find all chunks where full_doc_id equals the current doc_id
all_chunks = await self.text_chunks.get_all()
related_chunks = {
chunk_id: chunk_data
for chunk_id, chunk_data in all_chunks.items()
if isinstance(chunk_data, dict)
and chunk_data.get("full_doc_id") == doc_id
}
# 2. Get all related chunks if not related_chunks:
chunks = await self.text_chunks.get_by_id(doc_to_chunk_id) logger.warning(f"No chunks found for document {doc_id}")
if not chunks:
return return
chunk_ids = {chunks["full_doc_id"].replace("doc", "chunk")} # Get all related chunk IDs
chunk_ids = set(related_chunks.keys())
logger.debug(f"Found {len(chunk_ids)} chunks to delete") logger.debug(f"Found {len(chunk_ids)} chunks to delete")
# 3. Before deleting, check the related entities and relationships for these chunks # 3. Before deleting, check the related entities and relationships for these chunks
@@ -1630,9 +1654,18 @@ class LightRAG:
logger.warning(f"Document {doc_id} still exists in full_docs") logger.warning(f"Document {doc_id} still exists in full_docs")
# Verify if chunks have been deleted # Verify if chunks have been deleted
remaining_chunks = await self.text_chunks.get_by_id(doc_to_chunk_id) all_remaining_chunks = await self.text_chunks.get_all()
if remaining_chunks: remaining_related_chunks = {
logger.warning(f"Found {len(remaining_chunks)} remaining chunks") chunk_id: chunk_data
for chunk_id, chunk_data in all_remaining_chunks.items()
if isinstance(chunk_data, dict)
and chunk_data.get("full_doc_id") == doc_id
}
if remaining_related_chunks:
logger.warning(
f"Found {len(remaining_related_chunks)} remaining chunks"
)
# Verify entities and relationships # Verify entities and relationships
for chunk_id in chunk_ids: for chunk_id in chunk_ids:
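
End to end, the rewritten deletion path can be exercised and verified the same way the code above does internally; a sketch, with the document ID hypothetical:

```python
# Delete a document plus everything derived from it, then re-check chunks.
await rag.adelete_by_doc_id("doc-1")

remaining = await rag.text_chunks.get_all()
assert not any(
    isinstance(chunk, dict) and chunk.get("full_doc_id") == "doc-1"
    for chunk in remaining.values()
)
```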