Merge branch 'main' into add-multi-worker-support

yangdx
2025-03-01 15:55:37 +08:00
31 changed files with 1755 additions and 1371 deletions

View File

@@ -1,5 +1,5 @@
from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
__version__ = "1.2.1"
__version__ = "1.2.2"
__author__ = "Zirui Guo"
__url__ = "https://github.com/HKUDS/LightRAG"

View File

@@ -20,8 +20,8 @@ def create_graph_routes(rag, api_key: Optional[str] = None):
return await rag.get_graph_labels()
@router.get("/graphs", dependencies=[Depends(optional_api_key)])
async def get_knowledge_graph(label: str):
async def get_knowledge_graph(label: str, max_depth: int = 3):
"""Get knowledge graph for a specific label"""
return await rag.get_knowledge_graph(node_label=label, max_depth=3)
return await rag.get_knowledge_graph(node_label=label, max_depth=max_depth)
return router
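
The /graphs route now forwards an optional max_depth query parameter (default 3) instead of hard-coding the expansion depth. A minimal client-side sketch, assuming the API server is reachable at http://localhost:9621 and that the label below exists; add an API-key header if your deployment requires one:

import requests

BASE_URL = "http://localhost:9621"  # assumed server address; adjust to your deployment

# Request the neighborhood of a label, now with a caller-chosen depth.
resp = requests.get(
    f"{BASE_URL}/graphs",
    params={"label": "Artificial Intelligence", "max_depth": 2},  # hypothetical label
    timeout=30,
)
resp.raise_for_status()
graph = resp.json()
print(graph)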

File diff suppressed because one or more lines are too long (4 files)

View File

@@ -2,11 +2,11 @@
<html lang="en">
<head>
<meta charset="UTF-8" />
<link rel="icon" type="image/svg+xml" href="./vite.svg" />
<link rel="icon" type="image/svg+xml" href="./logo.png" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Lightrag</title>
<script type="module" crossorigin src="./assets/index-BDX8o1Ld.js"></script>
<link rel="stylesheet" crossorigin href="./assets/index-CLsJV-0i.css">
<script type="module" crossorigin src="./assets/index-DbuMPJAD.js"></script>
<link rel="stylesheet" crossorigin href="./assets/index-rP-YlyR1.css">
</head>
<body>
<div id="root"></div>

lightrag/api/webui/logo.png (new binary file, 155 KiB)

View File

@@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="iconify iconify--logos" width="31.88" height="32" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 257"><defs><linearGradient id="IconifyId1813088fe1fbc01fb466" x1="-.828%" x2="57.636%" y1="7.652%" y2="78.411%"><stop offset="0%" stop-color="#41D1FF"></stop><stop offset="100%" stop-color="#BD34FE"></stop></linearGradient><linearGradient id="IconifyId1813088fe1fbc01fb467" x1="43.376%" x2="50.316%" y1="2.242%" y2="89.03%"><stop offset="0%" stop-color="#FFEA83"></stop><stop offset="8.333%" stop-color="#FFDD35"></stop><stop offset="100%" stop-color="#FFA800"></stop></linearGradient></defs><path fill="url(#IconifyId1813088fe1fbc01fb466)" d="M255.153 37.938L134.897 252.976c-2.483 4.44-8.862 4.466-11.382.048L.875 37.958c-2.746-4.814 1.371-10.646 6.827-9.67l120.385 21.517a6.537 6.537 0 0 0 2.322-.004l117.867-21.483c5.438-.991 9.574 4.796 6.877 9.62Z"></path><path fill="url(#IconifyId1813088fe1fbc01fb467)" d="M185.432.063L96.44 17.501a3.268 3.268 0 0 0-2.634 3.014l-5.474 92.456a3.268 3.268 0 0 0 3.997 3.378l24.777-5.718c2.318-.535 4.413 1.507 3.936 3.838l-7.361 36.047c-.495 2.426 1.782 4.5 4.151 3.78l15.304-4.649c2.372-.72 4.652 1.36 4.15 3.788l-11.698 56.621c-.732 3.542 3.979 5.473 5.943 2.437l1.313-2.028l72.516-144.72c1.215-2.423-.88-5.186-3.54-4.672l-25.505 4.922c-2.396.462-4.435-1.77-3.759-4.114l16.646-57.705c.677-2.35-1.37-4.583-3.769-4.113Z"></path></svg>

Deleted SVG icon (1.5 KiB)

View File

@@ -363,14 +363,14 @@ class LightRAG:
self.namespace_prefix, NameSpace.VECTOR_STORE_ENTITIES
),
embedding_func=self.embedding_func,
meta_fields={"entity_name"},
meta_fields={"entity_name", "source_id", "content"},
)
self.relationships_vdb: BaseVectorStorage = self.vector_db_storage_cls( # type: ignore
namespace=make_namespace(
self.namespace_prefix, NameSpace.VECTOR_STORE_RELATIONSHIPS
),
embedding_func=self.embedding_func,
meta_fields={"src_id", "tgt_id"},
meta_fields={"src_id", "tgt_id", "source_id", "content"},
)
self.chunks_vdb: BaseVectorStorage = self.vector_db_storage_cls( # type: ignore
namespace=make_namespace(
@@ -408,16 +408,31 @@ class LightRAG:
self._storages_status = StoragesStatus.CREATED
# Initialize storages
if self.auto_manage_storages_states:
loop = always_get_an_event_loop()
loop.run_until_complete(self.initialize_storages())
self._run_async_safely(self.initialize_storages, "Storage Initialization")
def __del__(self):
# Finalize storages
if self.auto_manage_storages_states:
self._run_async_safely(self.finalize_storages, "Storage Finalization")
def _run_async_safely(self, async_func, action_name=""):
"""Safely execute an async function, avoiding event loop conflicts."""
try:
loop = always_get_an_event_loop()
loop.run_until_complete(self.finalize_storages())
if loop.is_running():
task = loop.create_task(async_func())
task.add_done_callback(
lambda t: logger.info(f"{action_name} completed!")
)
else:
loop.run_until_complete(async_func())
except RuntimeError:
logger.warning(
f"No running event loop, creating a new loop for {action_name}."
)
loop = asyncio.new_event_loop()
loop.run_until_complete(async_func())
loop.close()
async def initialize_storages(self):
"""Asynchronously initialize the storages"""
@@ -491,7 +506,7 @@ class LightRAG:
input: str | list[str],
split_by_character: str | None = None,
split_by_character_only: bool = False,
ids: list[str] | None = None,
ids: str | list[str] | None = None,
) -> None:
"""Sync Insert documents with checkpoint support
@@ -500,7 +515,7 @@ class LightRAG:
split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
split_by_character_only: if split_by_character_only is True, split the string by character only, when
split_by_character is None, this parameter is ignored.
ids: list of unique document IDs, if not provided, MD5 hash IDs will be generated
ids: single string of the document ID or list of unique document IDs, if not provided, MD5 hash IDs will be generated
"""
loop = always_get_an_event_loop()
loop.run_until_complete(
@@ -512,7 +527,7 @@ class LightRAG:
input: str | list[str],
split_by_character: str | None = None,
split_by_character_only: bool = False,
ids: list[str] | None = None,
ids: str | list[str] | None = None,
) -> None:
"""Async Insert documents with checkpoint support
@@ -528,12 +543,19 @@ class LightRAG:
split_by_character, split_by_character_only
)
def insert_custom_chunks(self, full_text: str, text_chunks: list[str]) -> None:
def insert_custom_chunks(
self,
full_text: str,
text_chunks: list[str],
doc_id: str | list[str] | None = None,
) -> None:
loop = always_get_an_event_loop()
loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks))
loop.run_until_complete(
self.ainsert_custom_chunks(full_text, text_chunks, doc_id)
)
async def ainsert_custom_chunks(
self, full_text: str, text_chunks: list[str]
self, full_text: str, text_chunks: list[str], doc_id: str | None = None
) -> None:
update_storage = False
try:
@@ -542,7 +564,10 @@ class LightRAG:
text_chunks = [self.clean_text(chunk) for chunk in text_chunks]
# Process cleaned texts
doc_key = compute_mdhash_id(full_text, prefix="doc-")
if doc_id is None:
doc_key = compute_mdhash_id(full_text, prefix="doc-")
else:
doc_key = doc_id
new_docs = {doc_key: {"content": full_text}}
_add_doc_keys = await self.full_docs.filter_keys({doc_key})
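
insert_custom_chunks can now pin the document to a caller-supplied ID instead of the MD5-derived "doc-" key. A minimal usage sketch, assuming an already constructed and initialized LightRAG instance named rag; the content and ID are hypothetical:

full_text = "LightRAG combines graph structures with vector retrieval."
chunks = ["LightRAG combines graph structures with vector retrieval."]

# Without doc_id the key is compute_mdhash_id(full_text, prefix="doc-");
# with doc_id the caller controls the key, making later lookup and deletion predictable.
rag.insert_custom_chunks(full_text, chunks, doc_id="doc-my-custom-id")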
@@ -598,6 +623,8 @@ class LightRAG:
"""
if isinstance(input, str):
input = [input]
if isinstance(ids, str):
ids = [ids]
# 1. Validate ids if provided or generate MD5 hash IDs
if ids is not None:
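
insert/ainsert now normalize a single string ID into a one-element list, so both calls below are valid. A usage sketch assuming an initialized LightRAG instance rag and hypothetical IDs:

# Single document with a single string ID (new in this change).
rag.insert("Doc about graph RAG.", ids="doc-graph-rag")

# Equivalent list form; when provided, one ID per document is expected.
rag.insert(
    ["Doc about graph RAG.", "Doc about vector search."],
    ids=["doc-graph-rag", "doc-vector-search"],
)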
@@ -1366,12 +1393,14 @@ class LightRAG:
logger.debug(f"Starting deletion for document {doc_id}")
doc_to_chunk_id = doc_id.replace("doc", "chunk")
# 2. Get all related chunks
chunks = await self.text_chunks.get_by_id(doc_id)
chunks = await self.text_chunks.get_by_id(doc_to_chunk_id)
if not chunks:
return
chunk_ids = list(chunks.keys())
chunk_ids = {chunks["full_doc_id"].replace("doc", "chunk")}
logger.debug(f"Found {len(chunk_ids)} chunks to delete")
# 3. Before deleting, check the related entities and relationships for these chunks
@@ -1380,7 +1409,7 @@ class LightRAG:
entities = [
dp
for dp in self.entities_vdb.client_storage["data"]
if dp.get("source_id") == chunk_id
if chunk_id in dp.get("source_id")
]
logger.debug(f"Chunk {chunk_id} has {len(entities)} related entities")
@@ -1388,7 +1417,7 @@ class LightRAG:
relations = [
dp
for dp in self.relationships_vdb.client_storage["data"]
if dp.get("source_id") == chunk_id
if chunk_id in dp.get("source_id")
]
logger.debug(f"Chunk {chunk_id} has {len(relations)} related relations")
@@ -1499,42 +1528,71 @@ class LightRAG:
f"Updated {len(entities_to_update)} entities and {len(relationships_to_update)} relationships."
)
async def process_data(data_type, vdb, chunk_id):
# Check data (entities or relationships)
data_with_chunk = [
dp
for dp in vdb.client_storage["data"]
if chunk_id in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
]
data_for_vdb = {}
if data_with_chunk:
logger.warning(
f"found {len(data_with_chunk)} {data_type} still referencing chunk {chunk_id}"
)
for item in data_with_chunk:
old_sources = item["source_id"].split(GRAPH_FIELD_SEP)
new_sources = [src for src in old_sources if src != chunk_id]
if not new_sources:
logger.info(
f"{data_type} {item.get('entity_name', 'N/A')} is deleted because source_id is not exists"
)
await vdb.delete_entity(item)
else:
item["source_id"] = GRAPH_FIELD_SEP.join(new_sources)
item_id = item["__id__"]
data_for_vdb[item_id] = item.copy()
if data_type == "entities":
data_for_vdb[item_id]["content"] = data_for_vdb[
item_id
].get("content") or (
item.get("entity_name", "")
+ (item.get("description") or "")
)
else: # relationships
data_for_vdb[item_id]["content"] = data_for_vdb[
item_id
].get("content") or (
(item.get("keywords") or "")
+ (item.get("src_id") or "")
+ (item.get("tgt_id") or "")
+ (item.get("description") or "")
)
if data_for_vdb:
await vdb.upsert(data_for_vdb)
logger.info(f"Successfully updated {data_type} in vector DB")
# Add verification step
async def verify_deletion():
# Verify if the document has been deleted
if await self.full_docs.get_by_id(doc_id):
logger.error(f"Document {doc_id} still exists in full_docs")
logger.warning(f"Document {doc_id} still exists in full_docs")
# Verify if chunks have been deleted
remaining_chunks = await self.text_chunks.get_by_id(doc_id)
remaining_chunks = await self.text_chunks.get_by_id(doc_to_chunk_id)
if remaining_chunks:
logger.error(f"Found {len(remaining_chunks)} remaining chunks")
logger.warning(f"Found {len(remaining_chunks)} remaining chunks")
# Verify entities and relationships
for chunk_id in chunk_ids:
# Check entities
entities_with_chunk = [
dp
for dp in self.entities_vdb.client_storage["data"]
if chunk_id
in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
]
if entities_with_chunk:
logger.error(
f"Found {len(entities_with_chunk)} entities still referencing chunk {chunk_id}"
)
# Check relationships
relations_with_chunk = [
dp
for dp in self.relationships_vdb.client_storage["data"]
if chunk_id
in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
]
if relations_with_chunk:
logger.error(
f"Found {len(relations_with_chunk)} relations still referencing chunk {chunk_id}"
)
await process_data("entities", self.entities_vdb, chunk_id)
await process_data(
"relationships", self.relationships_vdb, chunk_id
)
await verify_deletion()
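
process_data prunes the deleted chunk from each entity's or relationship's GRAPH_FIELD_SEP-joined source_id, and only deletes the record when no sources remain. A self-contained sketch of that pruning step; GRAPH_FIELD_SEP is assumed to be the "<SEP>" separator used by LightRAG's prompts, and the sample records are made up:

GRAPH_FIELD_SEP = "<SEP>"  # assumed value of the separator constant

def prune_chunk_from_source_id(record: dict, chunk_id: str) -> dict | None:
    """Return the updated record, or None when the last source was removed."""
    old_sources = (record.get("source_id") or "").split(GRAPH_FIELD_SEP)
    new_sources = [src for src in old_sources if src != chunk_id]
    if not new_sources:
        return None  # caller should delete the record from the vector DB
    updated = record.copy()
    updated["source_id"] = GRAPH_FIELD_SEP.join(new_sources)
    return updated

entity = {"entity_name": "LightRAG", "source_id": "chunk-aaa<SEP>chunk-bbb"}
print(prune_chunk_from_source_id(entity, "chunk-aaa"))  # source_id becomes "chunk-bbb"

only = {"entity_name": "LightRAG", "source_id": "chunk-aaa"}
print(prune_chunk_from_source_id(only, "chunk-aaa"))    # None -> record should be deleted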

View File

@@ -323,6 +323,7 @@ async def _merge_edges_then_upsert(
tgt_id=tgt_id,
description=description,
keywords=keywords,
source_id=source_id,
)
return edge_data
@@ -365,7 +366,7 @@ async def extract_entities(
tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
entity_types=",".join(entity_types),
entity_types=", ".join(entity_types),
language=language,
)
# add example's format
@@ -562,6 +563,7 @@ async def extract_entities(
compute_mdhash_id(dp["entity_name"], prefix="ent-"): {
"content": dp["entity_name"] + dp["description"],
"entity_name": dp["entity_name"],
"source_id": dp["source_id"],
}
for dp in all_entities_data
}
@@ -572,6 +574,7 @@ async def extract_entities(
compute_mdhash_id(dp["src_id"] + dp["tgt_id"], prefix="rel-"): {
"src_id": dp["src_id"],
"tgt_id": dp["tgt_id"],
"source_id": dp["source_id"],
"content": dp["keywords"]
+ dp["src_id"]
+ dp["tgt_id"]
@@ -595,7 +598,7 @@ async def kg_query(
global_config: dict[str, str],
hashing_kv: BaseKVStorage | None = None,
system_prompt: str | None = None,
) -> str:
) -> str | AsyncIterator[str]:
# Handle cache
use_model_func = global_config["llm_model_func"]
args_hash = compute_args_hash(query_param.mode, query, cache_type="query")
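
With the widened return type, kg_query (and the query paths built on it) may hand back either a complete string or an async iterator of chunks, so callers should branch on the type. A hedged consumer sketch, assuming an initialized LightRAG instance rag and that QueryParam exposes a stream flag:

import asyncio
from lightrag import QueryParam

async def ask(rag, question: str):
    result = await rag.aquery(question, param=QueryParam(mode="hybrid", stream=True))
    if isinstance(result, str):
        # Non-streaming path: the full answer is already assembled.
        print(result)
    else:
        # Streaming path: result is an AsyncIterator[str] of answer chunks.
        async for chunk in result:
            print(chunk, end="", flush=True)

# asyncio.run(ask(rag, "What does LightRAG do?"))  # rag assumed to exist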
@@ -1127,7 +1130,7 @@ async def _get_node_data(
len_node_datas = len(node_datas)
node_datas = truncate_list_by_token_size(
node_datas,
key=lambda x: x["description"],
key=lambda x: x["description"] if x["description"] is not None else "",
max_token_size=query_param.max_token_for_local_context,
)
logger.debug(
@@ -1310,7 +1313,7 @@ async def _find_most_related_edges_from_entities(
)
all_edges_data = truncate_list_by_token_size(
all_edges_data,
key=lambda x: x["description"],
key=lambda x: x["description"] if x["description"] is not None else "",
max_token_size=query_param.max_token_for_global_context,
)
@@ -1364,7 +1367,7 @@ async def _get_edge_data(
)
edge_datas = truncate_list_by_token_size(
edge_datas,
key=lambda x: x["description"],
key=lambda x: x["description"] if x["description"] is not None else "",
max_token_size=query_param.max_token_for_global_context,
)
use_entities, use_text_units = await asyncio.gather(
@@ -1468,7 +1471,7 @@ async def _find_most_related_entities_from_relationships(
len_node_datas = len(node_datas)
node_datas = truncate_list_by_token_size(
node_datas,
key=lambda x: x["description"],
key=lambda x: x["description"] if x["description"] is not None else "",
max_token_size=query_param.max_token_for_local_context,
)
logger.debug(
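
These truncation calls now guard against a missing description, since passing None into the token-size computation would raise a TypeError. A tiny illustration of the guarded key function; the sample data is made up:

node_datas = [
    {"entity_name": "LightRAG", "description": "Graph-based RAG framework."},
    {"entity_name": "Unknown", "description": None},  # would break a bare x["description"]
]

def safe_key(x):
    # Mirrors the new key= argument: fall back to "" when description is None.
    return x["description"] if x["description"] is not None else ""

print([len(safe_key(dp)) for dp in node_datas])  # no TypeError on the None entry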

View File

@@ -47,8 +47,9 @@ Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_
#############################
---Real Data---
######################
Entity_types: {entity_types}
Text: {input_text}
Entity_types: [{entity_types}]
Text:
{input_text}
######################
Output:"""
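
Together with the entity_types join change above (", ".join(entity_types)), the Real Data block now renders the types as a bracketed, comma-spaced list and puts the text on its own line. A reduced formatting sketch with hypothetical values, not the full prompt:

entity_types = ["organization", "person", "geo", "event"]
template = "Entity_types: [{entity_types}]\nText:\n{input_text}"

print(template.format(
    entity_types=", ".join(entity_types),  # ", " join matches the operate.py change
    input_text="Tesla opened a new factory in Berlin in 2022.",
))
# Entity_types: [organization, person, geo, event]
# Text:
# Tesla opened a new factory in Berlin in 2022.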