From 339bc992593d4b4bf070217f20d093be438dc46a Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 10 Apr 2025 20:31:52 +0800 Subject: [PATCH 1/3] Only merge new entities/edges during gleaning - Restrict gleaning to new entity names - Only add edges with new keys - Prevent similar description of the same entity or edge --- lightrag/operate.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index b1b803db..2a64a791 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -613,11 +613,13 @@ async def extract_entities( glean_result, chunk_key, file_path ) - # Merge results + # Merge results - only add entities and edges with new names for entity_name, entities in glean_nodes.items(): - maybe_nodes[entity_name].extend(entities) + if entity_name not in maybe_nodes: # Only accetp entities with new name in gleaning stage + maybe_nodes[entity_name].extend(entities) for edge_key, edges in glean_edges.items(): - maybe_edges[edge_key].extend(edges) + if edge_key not in maybe_edges: # Only accetp edges with new name in gleaning stage + maybe_edges[edge_key].extend(edges) if now_glean_index == entity_extract_max_gleaning - 1: break From 7d69449c670bd394d0c6f841867ded428b428efd Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 10 Apr 2025 20:32:40 +0800 Subject: [PATCH 2/3] Fix linting --- lightrag/operate.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 2a64a791..5d67b21b 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -615,10 +615,14 @@ async def extract_entities( # Merge results - only add entities and edges with new names for entity_name, entities in glean_nodes.items(): - if entity_name not in maybe_nodes: # Only accetp entities with new name in gleaning stage + if ( + entity_name not in maybe_nodes + ): # Only accetp entities with new name in gleaning stage maybe_nodes[entity_name].extend(entities) for edge_key, edges 
in glean_edges.items(): - if edge_key not in maybe_edges: # Only accetp edges with new name in gleaning stage + if ( + edge_key not in maybe_edges + ): # Only accetp edges with new name in gleaning stage maybe_edges[edge_key].extend(edges) if now_glean_index == entity_extract_max_gleaning - 1: From 96f439bb52186a268b313d1324632bb2a760091a Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 10 Apr 2025 21:19:26 +0800 Subject: [PATCH 3/3] Optimize pipeline status message --- lightrag/lightrag.py | 7 +++++++ lightrag/operate.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 153dc27a..0933a4d1 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -902,6 +902,13 @@ class LightRAG: # Get file path from status document file_path = getattr(status_doc, "file_path", "unknown_source") + async with pipeline_status_lock: + log_message = f"Processing file: {file_path}" + pipeline_status["history_messages"].append(log_message) + log_message = f"Processing d-id: {doc_id}" + pipeline_status["latest_message"] = log_message + pipeline_status["history_messages"].append(log_message) + # Generate chunks from document chunks: dict[str, Any] = { compute_mdhash_id(dp["content"], prefix="chunk-"): { diff --git a/lightrag/operate.py b/lightrag/operate.py index 5d67b21b..02d9c85e 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -642,7 +642,7 @@ async def extract_entities( processed_chunks += 1 entities_count = len(maybe_nodes) relations_count = len(maybe_edges) - log_message = f" Chk {processed_chunks}/{total_chunks}: extracted {entities_count} Ent + {relations_count} Rel (deduplicated)" + log_message = f"Chk {processed_chunks}/{total_chunks}: extracted {entities_count} Ent + {relations_count} Rel (deduplicated)" logger.info(log_message) if pipeline_status is not None: async with pipeline_status_lock: