chunk split retry

This commit is contained in:
童石渊
2025-01-07 16:26:12 +08:00
parent 059e3882f1
commit 6b19401dc6
3 changed files with 886 additions and 135 deletions

View File

@@ -268,7 +268,7 @@ class LightRAG:
self.llm_model_func,
hashing_kv=self.llm_response_cache
if self.llm_response_cache
and hasattr(self.llm_response_cache, "global_config")
and hasattr(self.llm_response_cache, "global_config")
else self.key_string_value_json_storage_cls(
namespace="llm_response_cache",
global_config=asdict(self),
@@ -316,7 +316,9 @@ class LightRAG:
def insert(self, string_or_strings, split_by_character=None):
loop = always_get_an_event_loop()
return loop.run_until_complete(self.ainsert(string_or_strings, split_by_character))
return loop.run_until_complete(
self.ainsert(string_or_strings, split_by_character)
)
async def ainsert(self, string_or_strings, split_by_character):
"""Insert documents with checkpoint support
@@ -357,10 +359,10 @@ class LightRAG:
# Process documents in batches
batch_size = self.addon_params.get("insert_batch_size", 10)
for i in range(0, len(new_docs), batch_size):
batch_docs = dict(list(new_docs.items())[i: i + batch_size])
batch_docs = dict(list(new_docs.items())[i : i + batch_size])
for doc_id, doc in tqdm_async(
batch_docs.items(), desc=f"Processing batch {i // batch_size + 1}"
batch_docs.items(), desc=f"Processing batch {i // batch_size + 1}"
):
try:
# Update status to processing
@@ -548,7 +550,7 @@ class LightRAG:
# Check if nodes exist in the knowledge graph
for need_insert_id in [src_id, tgt_id]:
if not (
await self.chunk_entity_relation_graph.has_node(need_insert_id)
await self.chunk_entity_relation_graph.has_node(need_insert_id)
):
await self.chunk_entity_relation_graph.upsert_node(
need_insert_id,
@@ -597,9 +599,9 @@ class LightRAG:
"src_id": dp["src_id"],
"tgt_id": dp["tgt_id"],
"content": dp["keywords"]
+ dp["src_id"]
+ dp["tgt_id"]
+ dp["description"],
+ dp["src_id"]
+ dp["tgt_id"]
+ dp["description"],
}
for dp in all_relationships_data
}
@@ -624,7 +626,7 @@ class LightRAG:
asdict(self),
hashing_kv=self.llm_response_cache
if self.llm_response_cache
and hasattr(self.llm_response_cache, "global_config")
and hasattr(self.llm_response_cache, "global_config")
else self.key_string_value_json_storage_cls(
namespace="llm_response_cache",
global_config=asdict(self),
@@ -640,7 +642,7 @@ class LightRAG:
asdict(self),
hashing_kv=self.llm_response_cache
if self.llm_response_cache
and hasattr(self.llm_response_cache, "global_config")
and hasattr(self.llm_response_cache, "global_config")
else self.key_string_value_json_storage_cls(
namespace="llm_response_cache",
global_config=asdict(self),
@@ -659,7 +661,7 @@ class LightRAG:
asdict(self),
hashing_kv=self.llm_response_cache
if self.llm_response_cache
and hasattr(self.llm_response_cache, "global_config")
and hasattr(self.llm_response_cache, "global_config")
else self.key_string_value_json_storage_cls(
namespace="llm_response_cache",
global_config=asdict(self),
@@ -900,7 +902,7 @@ class LightRAG:
dp
for dp in self.entities_vdb.client_storage["data"]
if chunk_id
in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
]
if entities_with_chunk:
logger.error(
@@ -912,7 +914,7 @@ class LightRAG:
dp
for dp in self.relationships_vdb.client_storage["data"]
if chunk_id
in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
]
if relations_with_chunk:
logger.error(
@@ -929,7 +931,7 @@ class LightRAG:
return asyncio.run(self.adelete_by_doc_id(doc_id))
async def get_entity_info(
self, entity_name: str, include_vector_data: bool = False
self, entity_name: str, include_vector_data: bool = False
):
"""Get detailed information of an entity
@@ -980,7 +982,7 @@ class LightRAG:
tracemalloc.stop()
async def get_relation_info(
self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
):
"""Get detailed information of a relationship
@@ -1022,7 +1024,7 @@ class LightRAG:
return result
def get_relation_info_sync(
self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
):
"""Synchronous version of getting relationship information