From 6972b3e1ccc7ea5a59d192c95210d16096fed335 Mon Sep 17 00:00:00 2001
From: Pankaj Kaushal
Date: Tue, 18 Mar 2025 12:08:42 +0100
Subject: [PATCH 01/11] Refactor logger setup to conditionally enable file
 logging. Add error handling for file creation issues.

---
 lightrag/utils.py | 58 +++++++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 362e5531..282042f6 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -109,15 +109,17 @@ def setup_logger(
     logger_name: str,
     level: str = "INFO",
     add_filter: bool = False,
-    log_file_path: str = None,
+    log_file_path: str | None = None,
+    enable_file_logging: bool = True,
 ):
-    """Set up a logger with console and file handlers
+    """Set up a logger with console and optionally file handlers

     Args:
         logger_name: Name of the logger to set up
         level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
         add_filter: Whether to add LightragPathFilter to the logger
-        log_file_path: Path to the log file. If None, will use current directory/lightrag.log
+        log_file_path: Path to the log file. If None and file logging is enabled, defaults to lightrag.log in LOG_DIR or cwd
+        enable_file_logging: Whether to enable logging to a file (defaults to True)
     """
     # Configure formatters
     detailed_formatter = logging.Formatter(
@@ -125,18 +127,6 @@ def setup_logger(
     )
     simple_formatter = logging.Formatter("%(levelname)s: %(message)s")

-    # Get log file path
-    if log_file_path is None:
-        log_dir = os.getenv("LOG_DIR", os.getcwd())
-        log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log"))
-
-    # Ensure log directory exists
-    os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
-
-    # Get log file max size and backup count from environment variables
-    log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
-    log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups
-
     logger_instance = logging.getLogger(logger_name)
     logger_instance.setLevel(level)
     logger_instance.handlers = []  # Clear existing handlers
@@ -148,16 +138,34 @@ def setup_logger(
     console_handler.setLevel(level)
     logger_instance.addHandler(console_handler)

-    # Add file handler
-    file_handler = logging.handlers.RotatingFileHandler(
-        filename=log_file_path,
-        maxBytes=log_max_bytes,
-        backupCount=log_backup_count,
-        encoding="utf-8",
-    )
-    file_handler.setFormatter(detailed_formatter)
-    file_handler.setLevel(level)
-    logger_instance.addHandler(file_handler)
+    # Add file handler by default unless explicitly disabled
+    if enable_file_logging:
+        # Get log file path
+        if log_file_path is None:
+            log_dir = os.getenv("LOG_DIR", os.getcwd())
+            log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log"))
+
+        # Ensure log directory exists
+        os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
+
+        # Get log file max size and backup count from environment variables
+        log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
+        log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups
+
+        try:
+            # Add file handler
+            file_handler = logging.handlers.RotatingFileHandler(
+                filename=log_file_path,
+                maxBytes=log_max_bytes,
+                backupCount=log_backup_count,
+                encoding="utf-8",
+            )
+            file_handler.setFormatter(detailed_formatter)
+            file_handler.setLevel(level)
+            logger_instance.addHandler(file_handler)
+        except PermissionError as e:
+            logger.warning(f"Could not create log file at {log_file_path}: {str(e)}")
+            logger.warning("Continuing with console logging only")

     # Add path filter if requested
     if add_filter:
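A minimal usage sketch of the new opt-out (the logger name and path below are illustrative, not prescribed by the patch):

```python
from lightrag.utils import setup_logger

# Read-only environments: skip the file handler entirely, so startup no
# longer depends on a writable log directory.
setup_logger("lightrag", level="DEBUG", enable_file_logging=False)

# Default behavior is unchanged: file logging stays on, and a PermissionError
# while creating the file now degrades to console-only with a warning.
setup_logger("lightrag", level="INFO", log_file_path="/tmp/lightrag.log")
```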
logging only") # Add path filter if requested if add_filter: From 1530798fb6107dac91d905d93dd653a5e4e07646 Mon Sep 17 00:00:00 2001 From: HowardChan Date: Wed, 19 Mar 2025 12:59:44 +0800 Subject: [PATCH 02/11] fix: ensure chunk_ids are correctly handled as VARCHAR array --- lightrag/kg/postgres_impl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index fcad869c..6a888464 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -1570,7 +1570,7 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, - chunk_id TEXT NULL, + chunk_ids VARCHAR(255)[] NULL, file_path TEXT NULL, CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id) )""" @@ -1585,7 +1585,7 @@ TABLES = { content_vector VECTOR, create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, update_time TIMESTAMP, - chunk_id TEXT NULL, + chunk_ids VARCHAR(255)[] NULL, file_path TEXT NULL, CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id) )""" From 0c39ff00a820ff98e3cccf206861db97eb6f4051 Mon Sep 17 00:00:00 2001 From: HowardChan Date: Wed, 19 Mar 2025 13:01:07 +0800 Subject: [PATCH 03/11] fix: resolve SQL type mismatch and conflict handling in LIGHTRAG_VDB_ENTITY insert query --- lightrag/kg/postgres_impl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index 6a888464..51f58b4e 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -1673,7 +1673,7 @@ SQL_TEMPLATES = { """, "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, content_vector, chunk_ids, file_path) - VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7::varchar[]) + VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7) ON CONFLICT (workspace,id) DO UPDATE SET entity_name=EXCLUDED.entity_name, content=EXCLUDED.content, @@ -1684,7 +1684,7 @@ SQL_TEMPLATES = { """, "upsert_relationship": """INSERT INTO LIGHTRAG_VDB_RELATION (workspace, id, source_id, target_id, content, content_vector, chunk_ids, file_path) - VALUES ($1, $2, $3, $4, $5, $6, $7::varchar[], $8::varchar[]) + VALUES ($1, $2, $3, $4, $5, $6, $7::varchar[], $8) ON CONFLICT (workspace,id) DO UPDATE SET source_id=EXCLUDED.source_id, target_id=EXCLUDED.target_id, From 7e53d94526754cff0950569e1241cb0d151b4f7d Mon Sep 17 00:00:00 2001 From: jofoks Date: Wed, 19 Mar 2025 15:24:25 -0700 Subject: [PATCH 04/11] Implemented simple PSQL doc-status get_by_ids method --- lightrag/kg/postgres_impl.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py index fcad869c..20ffc008 100644 --- a/lightrag/kg/postgres_impl.py +++ b/lightrag/kg/postgres_impl.py @@ -747,8 +747,30 @@ class PGDocStatusStorage(DocStatusStorage): ) async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]: - """Get doc_chunks data by id""" - raise NotImplementedError + """Get doc_chunks data by multiple IDs.""" + if not ids: + return [] + + sql = "SELECT * FROM LIGHTRAG_DOC_STATUS WHERE workspace=$1 AND id = ANY($2)" + params = {"workspace": self.db.workspace, "ids": ids} + + results = await self.db.query(sql, params, True) + + if not results: + return [] + return [ + { + "content": row["content"], + "content_length": row["content_length"], + "content_summary": row["content_summary"], + "status": row["status"], + "chunks_count": row["chunks_count"], + 
"created_at": row["created_at"], + "updated_at": row["updated_at"], + "file_path": row["file_path"], + } + for row in results + ] async def get_status_counts(self) -> dict[str, int]: """Get counts of documents in each status""" From 486a9e8a529c9e92ab5daf50de1bf0ad0faa16db Mon Sep 17 00:00:00 2001 From: zrguo Date: Thu, 20 Mar 2025 16:29:24 +0800 Subject: [PATCH 05/11] fix index --- lightrag/operate.py | 46 ++++++++++----------------------------------- 1 file changed, 10 insertions(+), 36 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index a937ae60..3291c49f 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -172,7 +172,7 @@ async def _handle_single_entity_extraction( entity_type=entity_type, description=entity_description, source_id=chunk_key, - metadata={"created_at": time.time(), "file_path": file_path}, + file_path=file_path, ) @@ -201,7 +201,7 @@ async def _handle_single_relationship_extraction( description=edge_description, keywords=edge_keywords, source_id=edge_source_id, - metadata={"created_at": time.time(), "file_path": file_path}, + file_path=file_path, ) @@ -224,9 +224,7 @@ async def _merge_nodes_then_upsert( split_string_by_multi_markers(already_node["source_id"], [GRAPH_FIELD_SEP]) ) already_file_paths.extend( - split_string_by_multi_markers( - already_node["metadata"]["file_path"], [GRAPH_FIELD_SEP] - ) + split_string_by_multi_markers(already_node["file_path"], [GRAPH_FIELD_SEP]) ) already_description.append(already_node["description"]) @@ -244,7 +242,7 @@ async def _merge_nodes_then_upsert( set([dp["source_id"] for dp in nodes_data] + already_source_ids) ) file_path = GRAPH_FIELD_SEP.join( - set([dp["metadata"]["file_path"] for dp in nodes_data] + already_file_paths) + set([dp["file_path"] for dp in nodes_data] + already_file_paths) ) logger.debug(f"file_path: {file_path}") @@ -298,7 +296,7 @@ async def _merge_edges_then_upsert( if already_edge.get("file_path") is not None: already_file_paths.extend( split_string_by_multi_markers( - already_edge["metadata"]["file_path"], [GRAPH_FIELD_SEP] + already_edge["file_path"], [GRAPH_FIELD_SEP] ) ) @@ -340,11 +338,7 @@ async def _merge_edges_then_upsert( ) file_path = GRAPH_FIELD_SEP.join( set( - [ - dp["metadata"]["file_path"] - for dp in edges_data - if dp.get("metadata", {}).get("file_path") - ] + [dp["file_path"] for dp in edges_data if dp.get("file_path")] + already_file_paths ) ) @@ -679,10 +673,6 @@ async def extract_entities( "content": f"{dp['entity_name']}\n{dp['description']}", "source_id": dp["source_id"], "file_path": dp.get("file_path", "unknown_source"), - "metadata": { - "created_at": dp.get("created_at", time.time()), - "file_path": dp.get("file_path", "unknown_source"), - }, } for dp in all_entities_data } @@ -697,10 +687,6 @@ async def extract_entities( "content": f"{dp['src_id']}\t{dp['tgt_id']}\n{dp['keywords']}\n{dp['description']}", "source_id": dp["source_id"], "file_path": dp.get("file_path", "unknown_source"), - "metadata": { - "created_at": dp.get("created_at", time.time()), - "file_path": dp.get("file_path", "unknown_source"), - }, } for dp in all_relationships_data } @@ -1285,11 +1271,8 @@ async def _get_node_data( if isinstance(created_at, (int, float)): created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at)) - # Get file path from metadata or directly from node data + # Get file path from node data file_path = n.get("file_path", "unknown_source") - if not file_path or file_path == "unknown_source": - # Try to get from metadata - file_path = 
n.get("metadata", {}).get("file_path", "unknown_source") entites_section_list.append( [ @@ -1323,11 +1306,8 @@ async def _get_node_data( if isinstance(created_at, (int, float)): created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at)) - # Get file path from metadata or directly from edge data + # Get file path from edge data file_path = e.get("file_path", "unknown_source") - if not file_path or file_path == "unknown_source": - # Try to get from metadata - file_path = e.get("metadata", {}).get("file_path", "unknown_source") relations_section_list.append( [ @@ -1564,11 +1544,8 @@ async def _get_edge_data( if isinstance(created_at, (int, float)): created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at)) - # Get file path from metadata or directly from edge data + # Get file path from edge data file_path = e.get("file_path", "unknown_source") - if not file_path or file_path == "unknown_source": - # Try to get from metadata - file_path = e.get("metadata", {}).get("file_path", "unknown_source") relations_section_list.append( [ @@ -1594,11 +1571,8 @@ async def _get_edge_data( if isinstance(created_at, (int, float)): created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at)) - # Get file path from metadata or directly from node data + # Get file path from node data file_path = n.get("file_path", "unknown_source") - if not file_path or file_path == "unknown_source": - # Try to get from metadata - file_path = n.get("metadata", {}).get("file_path", "unknown_source") entites_section_list.append( [ From f33bcbb32cfa7b4aad8936b564a288018188ecce Mon Sep 17 00:00:00 2001 From: Mario Vignieri Date: Thu, 20 Mar 2025 09:40:56 +0100 Subject: [PATCH 06/11] fix hf_embed torch device use MPS or CPU when CUDA is not available -macos users --- lightrag/llm/hf.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/lightrag/llm/hf.py b/lightrag/llm/hf.py index fb5208b0..954a99b7 100644 --- a/lightrag/llm/hf.py +++ b/lightrag/llm/hf.py @@ -138,16 +138,31 @@ async def hf_model_complete( async def hf_embed(texts: list[str], tokenizer, embed_model) -> np.ndarray: - device = next(embed_model.parameters()).device + # Detect the appropriate device + if torch.cuda.is_available(): + device = next(embed_model.parameters()).device # Use CUDA if available + elif torch.backends.mps.is_available(): + device = torch.device("mps") # Use MPS for Apple Silicon + else: + device = torch.device("cpu") # Fallback to CPU + + # Move the model to the detected device + embed_model = embed_model.to(device) + + # Tokenize the input texts and move them to the same device encoded_texts = tokenizer( texts, return_tensors="pt", padding=True, truncation=True ).to(device) + + # Perform inference with torch.no_grad(): outputs = embed_model( input_ids=encoded_texts["input_ids"], attention_mask=encoded_texts["attention_mask"], ) embeddings = outputs.last_hidden_state.mean(dim=1) + + # Convert embeddings to NumPy if embeddings.dtype == torch.bfloat16: return embeddings.detach().to(torch.float32).cpu().numpy() else: From d817defbc434defe9c27af003f01326e2dcd7570 Mon Sep 17 00:00:00 2001 From: zrguo Date: Thu, 20 Mar 2025 22:00:11 +0800 Subject: [PATCH 07/11] fix edit feature --- lightrag/lightrag.py | 66 +++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 37 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 485b3d97..5b61cd52 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1947,6 +1947,8 @@ class 
From f33bcbb32cfa7b4aad8936b564a288018188ecce Mon Sep 17 00:00:00 2001
From: Mario Vignieri
Date: Thu, 20 Mar 2025 09:40:56 +0100
Subject: [PATCH 06/11] fix hf_embed torch device: use MPS or CPU when CUDA is
 not available (macOS users)

---
 lightrag/llm/hf.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/lightrag/llm/hf.py b/lightrag/llm/hf.py
index fb5208b0..954a99b7 100644
--- a/lightrag/llm/hf.py
+++ b/lightrag/llm/hf.py
@@ -138,16 +138,31 @@ async def hf_model_complete(


 async def hf_embed(texts: list[str], tokenizer, embed_model) -> np.ndarray:
-    device = next(embed_model.parameters()).device
+    # Detect the appropriate device
+    if torch.cuda.is_available():
+        device = next(embed_model.parameters()).device  # Use CUDA if available
+    elif torch.backends.mps.is_available():
+        device = torch.device("mps")  # Use MPS for Apple Silicon
+    else:
+        device = torch.device("cpu")  # Fallback to CPU
+
+    # Move the model to the detected device
+    embed_model = embed_model.to(device)
+
+    # Tokenize the input texts and move them to the same device
     encoded_texts = tokenizer(
         texts, return_tensors="pt", padding=True, truncation=True
     ).to(device)
+
+    # Perform inference
     with torch.no_grad():
         outputs = embed_model(
             input_ids=encoded_texts["input_ids"],
             attention_mask=encoded_texts["attention_mask"],
         )
         embeddings = outputs.last_hidden_state.mean(dim=1)
+
+    # Convert embeddings to NumPy
     if embeddings.dtype == torch.bfloat16:
         return embeddings.detach().to(torch.float32).cpu().numpy()
     else:
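A hedged usage sketch; the model name is an example, not a project requirement, and any encoder loadable via AutoModel should behave the same with respect to device selection:

```python
import asyncio

from transformers import AutoModel, AutoTokenizer

from lightrag.llm.hf import hf_embed

tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")
embed_model = AutoModel.from_pretrained("BAAI/bge-m3")

# On Apple Silicon this now lands on MPS; with CUDA it keeps the model's
# existing device; otherwise it falls back to CPU.
embeddings = asyncio.run(hf_embed(["hello", "world"], tokenizer, embed_model))
print(embeddings.shape)  # (2, hidden_size)
```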
From d817defbc434defe9c27af003f01326e2dcd7570 Mon Sep 17 00:00:00 2001
From: zrguo
Date: Thu, 20 Mar 2025 22:00:11 +0800
Subject: [PATCH 07/11] fix edit feature

---
 lightrag/lightrag.py | 66 +++++++++++++++++++-------------------------
 1 file changed, 29 insertions(+), 37 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 485b3d97..5b61cd52 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -1947,6 +1947,8 @@ class LightRAG:

             # 2. Update entity information in the graph
             new_node_data = {**node_data, **updated_data}
+            new_node_data["entity_id"] = new_entity_name
+
             if "entity_name" in new_node_data:
                 del new_node_data[
                     "entity_name"
@@ -1963,7 +1965,7 @@

                 # Store relationships that need to be updated
                 relations_to_update = []
-
+                relations_to_delete = []
                 # Get all edges related to the original entity
                 edges = await self.chunk_entity_relation_graph.get_node_edges(
                     entity_name
@@ -1975,6 +1977,12 @@
                             source, target
                         )
                         if edge_data:
+                            relations_to_delete.append(
+                                compute_mdhash_id(source + target, prefix="rel-")
+                            )
+                            relations_to_delete.append(
+                                compute_mdhash_id(target + source, prefix="rel-")
+                            )
                             if source == entity_name:
                                 await self.chunk_entity_relation_graph.upsert_edge(
                                     new_entity_name, target, edge_data
@@ -2000,6 +2008,12 @@
                     f"Deleted old entity '{entity_name}' and its vector embedding from database"
                 )

+                # Delete old relation records from vector database
+                await self.relationships_vdb.delete(relations_to_delete)
+                logger.info(
+                    f"Deleted {len(relations_to_delete)} relation records for entity '{entity_name}' from vector database"
+                )
+
                 # Update relationship vector representations
                 for src, tgt, edge_data in relations_to_update:
                     description = edge_data.get("description", "")
@@ -2498,39 +2512,21 @@
             # 4. Get all relationships of the source entities
             all_relations = []
             for entity_name in source_entities:
-                # Get all relationships where this entity is the source
-                outgoing_edges = await self.chunk_entity_relation_graph.get_node_edges(
+                # Get all relationships of the source entities
+                edges = await self.chunk_entity_relation_graph.get_node_edges(
                     entity_name
                 )
-                if outgoing_edges:
-                    for src, tgt in outgoing_edges:
+                if edges:
+                    for src, tgt in edges:
                         # Ensure src is the current entity
                         if src == entity_name:
                             edge_data = await self.chunk_entity_relation_graph.get_edge(
                                 src, tgt
                             )
-                            all_relations.append(("outgoing", src, tgt, edge_data))
-
-                # Get all relationships where this entity is the target
-                incoming_edges = []
-                all_labels = await self.chunk_entity_relation_graph.get_all_labels()
-                for label in all_labels:
-                    if label == entity_name:
-                        continue
-                    node_edges = await self.chunk_entity_relation_graph.get_node_edges(
-                        label
-                    )
-                    for src, tgt in node_edges or []:
-                        if tgt == entity_name:
-                            incoming_edges.append((src, tgt))
-
-                for src, tgt in incoming_edges:
-                    edge_data = await self.chunk_entity_relation_graph.get_edge(
-                        src, tgt
-                    )
-                    all_relations.append(("incoming", src, tgt, edge_data))
+                            all_relations.append((src, tgt, edge_data))

             # 5. Create or update the target entity
+            merged_entity_data["entity_id"] = target_entity
             if not target_exists:
                 await self.chunk_entity_relation_graph.upsert_node(
                     target_entity, merged_entity_data
@@ -2544,8 +2540,11 @@

             # 6. Recreate all relationships, pointing to the target entity
             relation_updates = {}  # Track relationships that need to be merged
+            relations_to_delete = []

-            for rel_type, src, tgt, edge_data in all_relations:
+            for src, tgt, edge_data in all_relations:
+                relations_to_delete.append(compute_mdhash_id(src + tgt, prefix="rel-"))
+                relations_to_delete.append(compute_mdhash_id(tgt + src, prefix="rel-"))
                 new_src = target_entity if src in source_entities else src
                 new_tgt = target_entity if tgt in source_entities else tgt

@@ -2652,18 +2651,11 @@
                 entity_id = compute_mdhash_id(entity_name, prefix="ent-")
                 await self.entities_vdb.delete([entity_id])

-                # Also ensure any relationships specific to this entity are deleted from vector DB
-                # This is a safety check, as these should have been transformed to the target entity already
-                entity_relation_prefix = compute_mdhash_id(entity_name, prefix="rel-")
-                relations_with_entity = await self.relationships_vdb.search_by_prefix(
-                    entity_relation_prefix
-                )
-                if relations_with_entity:
-                    relation_ids = [r["id"] for r in relations_with_entity]
-                    await self.relationships_vdb.delete(relation_ids)
-                    logger.info(
-                        f"Deleted {len(relation_ids)} relation records for entity '{entity_name}' from vector database"
-                    )
+                # Delete relationships records from vector database
+                await self.relationships_vdb.delete(relations_to_delete)
+                logger.info(
+                    f"Deleted {len(relations_to_delete)} relation records for entity '{entity_name}' from vector database"
+                )

                 logger.info(
                     f"Deleted source entity '{entity_name}' and its vector embedding from database"
                 )

From eb1e954ced2a9472a52533fbf2c7189fd12f1e6c Mon Sep 17 00:00:00 2001
From: zrguo
Date: Thu, 20 Mar 2025 22:13:31 +0800
Subject: [PATCH 08/11] fix lint

---
 lightrag/lightrag.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index 5b61cd52..da05bf13 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -2589,6 +2589,12 @@ class LightRAG:
                     f"Created or updated relationship: {rel_data['src']} -> {rel_data['tgt']}"
                 )

+            # Delete relationships records from vector database
+            await self.relationships_vdb.delete(relations_to_delete)
+            logger.info(
+                f"Deleted {len(relations_to_delete)} relation records for entity '{entity_name}' from vector database"
+            )
+
             # 7. Update entity vector representation
             description = merged_entity_data.get("description", "")
             source_id = merged_entity_data.get("source_id", "")
@@ -2651,12 +2657,6 @@ class LightRAG:
                 entity_id = compute_mdhash_id(entity_name, prefix="ent-")
                 await self.entities_vdb.delete([entity_id])

-                # Delete relationships records from vector database
-                await self.relationships_vdb.delete(relations_to_delete)
-                logger.info(
-                    f"Deleted {len(relations_to_delete)} relation records for entity '{entity_name}' from vector database"
-                )
-
                 logger.info(
                     f"Deleted source entity '{entity_name}' and its vector embedding from database"
                 )
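Why each edge contributes two IDs to relations_to_delete: relation vector IDs are content hashes of the concatenated endpoint names, and an undirected edge may have been stored under either ordering. A sketch using compute_mdhash_id from lightrag.utils (entity names are illustrative):

```python
from lightrag.utils import compute_mdhash_id

src, tgt = "Entity A", "Entity B"
relations_to_delete = [
    compute_mdhash_id(src + tgt, prefix="rel-"),
    compute_mdhash_id(tgt + src, prefix="rel-"),
]
# Typically only one ordering exists in the vector store; deleting both is a
# cheap way to avoid checking which ordering was used at insert time, and it
# replaces the slower search_by_prefix scan removed above.
print(relations_to_delete)
```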
From 9b4f41d8d5c68b1dcfeef52d5c63f40e05954603 Mon Sep 17 00:00:00 2001
From: zrguo
Date: Fri, 21 Mar 2025 13:47:53 +0800
Subject: [PATCH 09/11] add embedding_batch_num and embedding_func_max_async
 to env

---
 lightrag/lightrag.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py
index da05bf13..f053b25e 100644
--- a/lightrag/lightrag.py
+++ b/lightrag/lightrag.py
@@ -183,10 +183,10 @@ class LightRAG:
     embedding_func: EmbeddingFunc | None = field(default=None)
     """Function for computing text embeddings. Must be set before use."""

-    embedding_batch_num: int = field(default=32)
+    embedding_batch_num: int = field(default=int(os.getenv("EMBEDDING_BATCH_NUM", 32)))
     """Batch size for embedding computations."""

-    embedding_func_max_async: int = field(default=16)
+    embedding_func_max_async: int = field(default=int(os.getenv("EMBEDDING_FUNC_MAX_ASYNC", 16)))
     """Maximum number of concurrent embedding function calls."""

     embedding_cache_config: dict[str, Any] = field(

From 012252fbe212c4b9d74d276fbbd7a7279c9b5ff2 Mon Sep 17 00:00:00 2001
From: zrguo
Date: Fri, 21 Mar 2025 13:49:13 +0800
Subject: [PATCH 10/11] Update env.example

---
 env.example | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/env.example b/env.example
index 86b4481f..fffa89a4 100644
--- a/env.example
+++ b/env.example
@@ -73,6 +73,8 @@ LLM_BINDING_HOST=http://localhost:11434
 ### Embedding Configuration (Use valid host. For local services installed with docker, you can use host.docker.internal)
 EMBEDDING_MODEL=bge-m3:latest
 EMBEDDING_DIM=1024
+EMBEDDING_BATCH_NUM=32
+EMBEDDING_FUNC_MAX_ASYNC=16
 # EMBEDDING_BINDING_API_KEY=your_api_key
 ### ollama example
 EMBEDDING_BINDING=ollama

From ed1f30d79f3d998cc97cae0cc864bac82cf3bf4d Mon Sep 17 00:00:00 2001
From: zrguo
Date: Fri, 21 Mar 2025 13:50:46 +0800
Subject: [PATCH 11/11] Update version

---
 lightrag/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightrag/__init__.py b/lightrag/__init__.py
index 89475dca..f7dee888 100644
--- a/lightrag/__init__.py
+++ b/lightrag/__init__.py
@@ -1,5 +1,5 @@
 from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam

-__version__ = "1.2.6"
+__version__ = "1.2.7"

 __author__ = "Zirui Guo"
 __url__ = "https://github.com/HKUDS/LightRAG"
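A quick sanity check for the environment overrides introduced in patches 09 and 10; values and constructor arguments below are illustrative. One caveat worth noting: the dataclass defaults call os.getenv() at class-definition time, so the variables must be set before lightrag is imported.

```python
import os

os.environ["EMBEDDING_BATCH_NUM"] = "64"
os.environ["EMBEDDING_FUNC_MAX_ASYNC"] = "8"

from lightrag import LightRAG  # noqa: E402 (import after env setup is intentional)

# A real instance also needs llm_model_func / embedding_func configured;
# they are omitted here for brevity.
rag = LightRAG(working_dir="./rag_storage")
print(rag.embedding_batch_num, rag.embedding_func_max_async)  # 64 8
```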