zrguo
2025-03-17 23:36:00 +08:00
parent bf18a5406e
commit 6115f60072
2 changed files with 73 additions and 45 deletions

View File

@@ -563,7 +563,9 @@ class LightRAG:
""" """
loop = always_get_an_event_loop() loop = always_get_an_event_loop()
loop.run_until_complete( loop.run_until_complete(
self.ainsert(input, split_by_character, split_by_character_only, ids, file_paths) self.ainsert(
input, split_by_character, split_by_character_only, ids, file_paths
)
) )
async def ainsert( async def ainsert(
@@ -659,7 +661,10 @@ class LightRAG:
         await self._insert_done()
 
     async def apipeline_enqueue_documents(
-        self, input: str | list[str], ids: list[str] | None = None, file_paths: str | list[str] | None = None
+        self,
+        input: str | list[str],
+        ids: list[str] | None = None,
+        file_paths: str | list[str] | None = None,
     ) -> None:
         """
         Pipeline for Processing Documents
@@ -687,7 +692,9 @@ class LightRAG:
             if isinstance(file_paths, str):
                 file_paths = [file_paths]
             if len(file_paths) != len(input):
-                raise ValueError("Number of file paths must match the number of documents")
+                raise ValueError(
+                    "Number of file paths must match the number of documents"
+                )
         else:
             # If no file paths provided, use placeholder
             file_paths = ["unknown_source"] * len(input)
@@ -703,11 +710,15 @@ class LightRAG:
                 raise ValueError("IDs must be unique")
 
             # Generate contents dict of IDs provided by user and documents
-            contents = {id_: {"content": doc, "file_path": path}
-                        for id_, doc, path in zip(ids, input, file_paths)}
+            contents = {
+                id_: {"content": doc, "file_path": path}
+                for id_, doc, path in zip(ids, input, file_paths)
+            }
         else:
             # Clean input text and remove duplicates
-            cleaned_input = [(clean_text(doc), path) for doc, path in zip(input, file_paths)]
+            cleaned_input = [
+                (clean_text(doc), path) for doc, path in zip(input, file_paths)
+            ]
             unique_content_with_paths = {}
 
             # Keep track of unique content and their paths
@@ -716,9 +727,13 @@ class LightRAG:
                     unique_content_with_paths[content] = path
 
             # Generate contents dict of MD5 hash IDs and documents with paths
-            contents = {compute_mdhash_id(content, prefix="doc-"):
-                        {"content": content, "file_path": path}
-                        for content, path in unique_content_with_paths.items()}
+            contents = {
+                compute_mdhash_id(content, prefix="doc-"): {
+                    "content": content,
+                    "file_path": path,
+                }
+                for content, path in unique_content_with_paths.items()
+            }
 
         # 2. Remove duplicate contents
         unique_contents = {}
@@ -729,8 +744,10 @@ class LightRAG:
                 unique_contents[content] = (id_, file_path)
 
         # Reconstruct contents with unique content
-        contents = {id_: {"content": content, "file_path": file_path}
-                    for content, (id_, file_path) in unique_contents.items()}
+        contents = {
+            id_: {"content": content, "file_path": file_path}
+            for content, (id_, file_path) in unique_contents.items()
+        }
 
         # 3. Generate document initial status
         new_docs: dict[str, Any] = {
@@ -741,7 +758,9 @@ class LightRAG:
"content_length": len(content_data["content"]), "content_length": len(content_data["content"]),
"created_at": datetime.now().isoformat(), "created_at": datetime.now().isoformat(),
"updated_at": datetime.now().isoformat(), "updated_at": datetime.now().isoformat(),
"file_path": content_data["file_path"], # Store file path in document status "file_path": content_data[
"file_path"
], # Store file path in document status
} }
for id_, content_data in contents.items() for id_, content_data in contents.items()
} }
@@ -1109,7 +1128,10 @@ class LightRAG:
         loop.run_until_complete(self.ainsert_custom_kg(custom_kg, full_doc_id))
 
     async def ainsert_custom_kg(
-        self, custom_kg: dict[str, Any], full_doc_id: str = None, file_path: str = "custom_kg"
+        self,
+        custom_kg: dict[str, Any],
+        full_doc_id: str = None,
+        file_path: str = "custom_kg",
     ) -> None:
         update_storage = False
         try:
@@ -3125,4 +3147,3 @@ class LightRAG:
                 ]
             ]
         )
-
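
Note: the enqueue hunks above only re-wrap existing logic; documents are cleaned, deduplicated by content, and keyed by an MD5-based "doc-" ID before the status records are built. The sketch below is a minimal, self-contained illustration of that dedup step, not the library code itself: clean_text and compute_mdhash_id are assumed stand-ins for the helpers of the same name (assumed to be whitespace trimming and an MD5 hex digest, respectively).

import hashlib
from datetime import datetime
from typing import Any


def clean_text(text: str) -> str:
    # Assumption: cleaning amounts to trimming surrounding whitespace.
    return text.strip()


def compute_mdhash_id(content: str, prefix: str = "") -> str:
    # Assumption: IDs are a prefixed MD5 hex digest of the content.
    return prefix + hashlib.md5(content.encode("utf-8")).hexdigest()


def enqueue_preview(
    input: str | list[str],
    file_paths: str | list[str] | None = None,
) -> dict[str, dict[str, Any]]:
    """Mirror the no-IDs branch: clean, dedupe by content, key by doc- hash."""
    if isinstance(input, str):
        input = [input]
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    if file_paths is None:
        # Same placeholder as in the diff when no paths are given.
        file_paths = ["unknown_source"] * len(input)
    if len(file_paths) != len(input):
        raise ValueError("Number of file paths must match the number of documents")

    cleaned_input = [(clean_text(doc), path) for doc, path in zip(input, file_paths)]

    # Keep the first file path seen for each unique cleaned content.
    unique_content_with_paths: dict[str, str] = {}
    for content, path in cleaned_input:
        if content not in unique_content_with_paths:
            unique_content_with_paths[content] = path

    # Same shape as the contents / new_docs dicts built in the hunks above.
    return {
        compute_mdhash_id(content, prefix="doc-"): {
            "content": content,
            "file_path": path,
            "content_length": len(content),
            "created_at": datetime.now().isoformat(),
        }
        for content, path in unique_content_with_paths.items()
    }


if __name__ == "__main__":
    docs = enqueue_preview(
        ["hello world", "hello world ", "second doc"],
        ["a.txt", "b.txt", "c.txt"],
    )
    # The two "hello world" variants collapse to one doc- ID, keeping a.txt.
    print(len(docs), list(docs))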

View File

@@ -224,7 +224,9 @@ async def _merge_nodes_then_upsert(
             split_string_by_multi_markers(already_node["source_id"], [GRAPH_FIELD_SEP])
         )
         already_file_paths.extend(
-            split_string_by_multi_markers(already_node["metadata"]["file_path"], [GRAPH_FIELD_SEP])
+            split_string_by_multi_markers(
+                already_node["metadata"]["file_path"], [GRAPH_FIELD_SEP]
+            )
         )
         already_description.append(already_node["description"])
 
@@ -336,7 +338,14 @@ async def _merge_edges_then_upsert(
         )
     )
     file_path = GRAPH_FIELD_SEP.join(
-        set([dp["metadata"]["file_path"] for dp in edges_data if dp.get("metadata", {}).get("file_path")] + already_file_paths)
+        set(
+            [
+                dp["metadata"]["file_path"]
+                for dp in edges_data
+                if dp.get("metadata", {}).get("file_path")
+            ]
+            + already_file_paths
+        )
     )
 
     for need_insert_id in [src_id, tgt_id]:
@@ -482,7 +491,9 @@ async def extract_entities(
         else:
             return await use_llm_func(input_text)
 
-    async def _process_extraction_result(result: str, chunk_key: str, file_path: str = "unknown_source"):
+    async def _process_extraction_result(
+        result: str, chunk_key: str, file_path: str = "unknown_source"
+    ):
         """Process a single extraction result (either initial or gleaning)
         Args:
             result (str): The extraction result to process
@@ -669,7 +680,9 @@ async def extract_entities(
"file_path": dp.get("metadata", {}).get("file_path", "unknown_source"), "file_path": dp.get("metadata", {}).get("file_path", "unknown_source"),
"metadata": { "metadata": {
"created_at": dp.get("metadata", {}).get("created_at", time.time()), "created_at": dp.get("metadata", {}).get("created_at", time.time()),
"file_path": dp.get("metadata", {}).get("file_path", "unknown_source"), "file_path": dp.get("metadata", {}).get(
"file_path", "unknown_source"
),
}, },
} }
for dp in all_entities_data for dp in all_entities_data
@@ -687,7 +700,9 @@ async def extract_entities(
"file_path": dp.get("metadata", {}).get("file_path", "unknown_source"), "file_path": dp.get("metadata", {}).get("file_path", "unknown_source"),
"metadata": { "metadata": {
"created_at": dp.get("metadata", {}).get("created_at", time.time()), "created_at": dp.get("metadata", {}).get("created_at", time.time()),
"file_path": dp.get("metadata", {}).get("file_path", "unknown_source"), "file_path": dp.get("metadata", {}).get(
"file_path", "unknown_source"
),
}, },
} }
for dp in all_relationships_data for dp in all_relationships_data
@@ -1574,15 +1589,7 @@ async def _get_edge_data(
     relations_context = list_of_list_to_csv(relations_section_list)
 
     entites_section_list = [
-        [
-            "id",
-            "entity",
-            "type",
-            "description",
-            "rank",
-            "created_at",
-            "file_path"
-        ]
+        ["id", "entity", "type", "description", "rank", "created_at", "file_path"]
     ]
     for i, n in enumerate(use_entities):
         created_at = n.get("created_at", "Unknown")
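
Note: the operate.py hunks re-wrap one recurring pattern: file paths from freshly extracted edges are merged with any already-stored paths and serialized into a single GRAPH_FIELD_SEP-joined string. The sketch below illustrates that round trip; the "<SEP>" delimiter value and the behavior of split_string_by_multi_markers are assumptions here, not taken from this diff.

import re

# Assumption: the field separator used in stored graph records.
GRAPH_FIELD_SEP = "<SEP>"


def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]:
    """Assumed helper: split on any marker and drop empty pieces."""
    if not markers:
        return [content]
    parts = re.split("|".join(re.escape(m) for m in markers), content)
    return [p.strip() for p in parts if p.strip()]


def merge_file_paths(edges_data: list[dict], already_file_path: str | None) -> str:
    """Mirror the file_path merge reformatted in _merge_edges_then_upsert."""
    already_file_paths: list[str] = []
    if already_file_path:
        already_file_paths.extend(
            split_string_by_multi_markers(already_file_path, [GRAPH_FIELD_SEP])
        )
    return GRAPH_FIELD_SEP.join(
        set(
            [
                dp["metadata"]["file_path"]
                for dp in edges_data
                if dp.get("metadata", {}).get("file_path")
            ]
            + already_file_paths
        )
    )


if __name__ == "__main__":
    edges = [
        {"metadata": {"file_path": "a.txt"}},
        {"metadata": {"file_path": "b.txt"}},
        {"metadata": {}},  # no file_path -> skipped by the filter
    ]
    merged = merge_file_paths(edges, "a.txt<SEP>c.txt")
    # Order is unspecified (set semantics), but a.txt appears only once.
    print(sorted(merged.split(GRAPH_FIELD_SEP)))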