From 39633cb1d90cf42b26402c06a6fd8f22b99b3cca Mon Sep 17 00:00:00 2001 From: Rushi Chaganti Date: Wed, 12 Mar 2025 00:04:23 +0530 Subject: [PATCH 01/23] Fixed lint and Added new imports at the top of the file --- README.md | 233 ++++++++++++++++++++++--------- lightrag/lightrag.py | 321 ++++++++++++++++++++++++++++++++++++++++++- requirements.txt | 9 +- 3 files changed, 493 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index 018a94e6..61e7b20f 100644 --- a/README.md +++ b/README.md @@ -37,28 +37,30 @@ This repository hosts the code of LightRAG. The structure of this code is based
+ + +
🎉 News - -- [x] [2025.02.05]🎯📢Our team has released [VideoRAG](https://github.com/HKUDS/VideoRAG) understanding extremely long-context videos. -- [x] [2025.01.13]🎯📢Our team has released [MiniRAG](https://github.com/HKUDS/MiniRAG) making RAG simpler with small models. -- [x] [2025.01.06]🎯📢You can now [use PostgreSQL for Storage](#using-postgresql-for-storage). -- [x] [2024.12.31]🎯📢LightRAG now supports [deletion by document ID](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete). -- [x] [2024.11.25]🎯📢LightRAG now supports seamless integration of [custom knowledge graphs](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg), empowering users to enhance the system with their own domain expertise. -- [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author. -- [x] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py). -- [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete). -- [x] [2024.11.09]🎯📢Introducing the [LightRAG Gui](https://lightrag-gui.streamlit.app), which allows you to insert, query, visualize, and download LightRAG knowledge. -- [x] [2024.11.04]🎯📢You can now [use Neo4J for Storage](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-neo4j-for-storage). -- [x] [2024.10.29]🎯📢LightRAG now supports multiple file types, including PDF, DOC, PPT, and CSV via `textract`. -- [x] [2024.10.20]🎯📢We've added a new feature to LightRAG: Graph Visualization. -- [x] [2024.10.18]🎯📢We've added a link to a [LightRAG Introduction Video](https://youtu.be/oageL-1I0GE). Thanks to the author! -- [x] [2024.10.17]🎯📢We have created a [Discord channel](https://discord.gg/yF2MmDJyGJ)! Welcome to join for sharing and discussions! 🎉🎉 -- [x] [2024.10.16]🎯📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! -- [x] [2024.10.15]🎯📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! +- [X] [2025.02.05]🎯📢Our team has released [VideoRAG](https://github.com/HKUDS/VideoRAG) understanding extremely long-context videos. +- [X] [2025.01.13]🎯📢Our team has released [MiniRAG](https://github.com/HKUDS/MiniRAG) making RAG simpler with small models. +- [X] [2025.01.06]🎯📢You can now [use PostgreSQL for Storage](#using-postgresql-for-storage). +- [X] [2024.12.31]🎯📢LightRAG now supports [deletion by document ID](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete). +- [X] [2024.11.25]🎯📢LightRAG now supports seamless integration of [custom knowledge graphs](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg), empowering users to enhance the system with their own domain expertise. +- [X] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author. +- [X] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py). +- [X] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete). 
+- [X] [2024.11.09]🎯📢Introducing the [LightRAG Gui](https://lightrag-gui.streamlit.app), which allows you to insert, query, visualize, and download LightRAG knowledge. +- [X] [2024.11.04]🎯📢You can now [use Neo4J for Storage](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-neo4j-for-storage). +- [X] [2024.10.29]🎯📢LightRAG now supports multiple file types, including PDF, DOC, PPT, and CSV via `textract`. +- [X] [2024.10.20]🎯📢We've added a new feature to LightRAG: Graph Visualization. +- [X] [2024.10.18]🎯📢We've added a link to a [LightRAG Introduction Video](https://youtu.be/oageL-1I0GE). Thanks to the author! +- [X] [2024.10.17]🎯📢We have created a [Discord channel](https://discord.gg/yF2MmDJyGJ)! Welcome to join for sharing and discussions! 🎉🎉 +- [X] [2024.10.16]🎯📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)! +- [X] [2024.10.15]🎯📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)!
@@ -82,16 +84,20 @@ This repository hosts the code of LightRAG. The structure of this code is based cd LightRAG pip install -e . ``` + * Install from PyPI + ```bash pip install lightrag-hku ``` ## Quick Start + * [Video demo](https://www.youtube.com/watch?v=g21royNJ4fw) of running LightRAG locally. * All the code can be found in the `examples`. * Set OpenAI API key in environment if using OpenAI models: `export OPENAI_API_KEY="sk-...".` * Download the demo text "A Christmas Carol by Charles Dickens": + ```bash curl https://raw.githubusercontent.com/gusye1234/nano-graphrag/main/tests/mock_data.txt > ./book.txt ``` @@ -187,6 +193,7 @@ class QueryParam: Using Open AI-like APIs * LightRAG also supports Open AI-like chat/embeddings APIs: + ```python async def llm_model_func( prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs @@ -225,6 +232,7 @@ async def initialize_rag(): return rag ``` +
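A minimal end-to-end sketch of using the `initialize_rag()` helper defined above, assuming an OpenAI-compatible endpoint is reachable; the sample text and question are placeholders:

```python
import asyncio

from lightrag import QueryParam


async def main():
    # Reuse the helper defined above to build a configured LightRAG instance
    rag = await initialize_rag()

    # Index a small piece of text, then ask a question about it
    await rag.ainsert("LightRAG builds a knowledge graph from the documents you insert.")
    answer = await rag.aquery(
        "What does LightRAG build from inserted documents?",
        param=QueryParam(mode="hybrid"),
    )
    print(answer)


if __name__ == "__main__":
    asyncio.run(main())
```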
@@ -252,12 +260,14 @@ rag = LightRAG( ), ) ``` +
Using Ollama Models ### Overview + If you want to use Ollama models, you need to pull model you plan to use and embedding model, for example `nomic-embed-text`. Then you only need to set LightRAG as follows: @@ -281,31 +291,37 @@ rag = LightRAG( ``` ### Increasing context size + In order for LightRAG to work context should be at least 32k tokens. By default Ollama models have context size of 8k. You can achieve this using one of two ways: #### Increasing the `num_ctx` parameter in Modelfile. 1. Pull the model: + ```bash ollama pull qwen2 ``` 2. Display the model file: + ```bash ollama show --modelfile qwen2 > Modelfile ``` 3. Edit the Modelfile by adding the following line: + ```bash PARAMETER num_ctx 32768 ``` 4. Create the modified model: + ```bash ollama create -f Modelfile qwen2m ``` #### Setup `num_ctx` via Ollama API. + Tiy can use `llm_model_kwargs` param to configure ollama: ```python @@ -325,6 +341,7 @@ rag = LightRAG( ), ) ``` + #### Low RAM GPUs In order to run this experiment on low RAM GPU you should select small model and tune context window (increasing context increase memory consumption). For example, running this ollama example on repurposed mining GPU with 6Gb of RAM required to set context size to 26k while using `gemma2:2b`. It was able to find 197 entities and 19 relations on `book.txt`. @@ -402,6 +419,7 @@ if __name__ == "__main__": ``` #### For detailed documentation and examples, see: + - [LlamaIndex Documentation](lightrag/llm/Readme.md) - [Direct OpenAI Example](examples/lightrag_llamaindex_direct_demo.py) - [LiteLLM Proxy Example](examples/lightrag_llamaindex_litellm_demo.py) @@ -483,13 +501,16 @@ print(response_custom) We've introduced a new function `query_with_separate_keyword_extraction` to enhance the keyword extraction capabilities. This function separates the keyword extraction process from the user's prompt, focusing solely on the query to improve the relevance of extracted keywords. ##### How It Works? + The function operates by dividing the input into two parts: + - `User Query` - `Prompt` It then performs keyword extraction exclusively on the `user query`. This separation ensures that the extraction process is focused and relevant, unaffected by any additional language in the `prompt`. It also allows the `prompt` to serve purely for response formatting, maintaining the intent and clarity of the user's original question. ##### Usage Example + This `example` shows how to tailor the function for educational content, focusing on detailed explanations for older students. ```python @@ -563,6 +584,7 @@ custom_kg = { rag.insert_custom_kg(custom_kg) ``` +
## Insert @@ -593,6 +615,7 @@ rag.insert(["TEXT1", "TEXT2", "TEXT3", ...]) # Documents will be processed in b ``` The `insert_batch_size` parameter in `addon_params` controls how many documents are processed in each batch during insertion. This is useful for: + - Managing memory usage with large document collections - Optimizing processing speed - Providing better progress tracking @@ -647,6 +670,7 @@ text_content = textract.process(file_path) rag.insert(text_content.decode('utf-8')) ``` + ## Storage @@ -685,6 +709,7 @@ async def initialize_rag(): return rag ``` + see test_neo4j.py for a working example. @@ -693,6 +718,7 @@ see test_neo4j.py for a working example. Using PostgreSQL for Storage For production level scenarios you will most likely want to leverage an enterprise solution. PostgreSQL can provide a one-stop solution for you as KV store, VectorDB (pgvector) and GraphDB (apache AGE). + * PostgreSQL is lightweight,the whole binary distribution including all necessary plugins can be zipped to 40MB: Ref to [Windows Release](https://github.com/ShanGor/apache-age-windows/releases/tag/PG17%2Fv1.5.0-rc0) as it is easy to install for Linux/Mac. * If you prefer docker, please start with this image if you are a beginner to avoid hiccups (DO read the overview): https://hub.docker.com/r/shangor/postgres-for-rag * How to start? Ref to: [examples/lightrag_zhipu_postgres_demo.py](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_zhipu_postgres_demo.py) @@ -735,6 +761,7 @@ For production level scenarios you will most likely want to leverage an enterpri > It is a known issue of the release version: https://github.com/apache/age/pull/1721 > > You can Compile the AGE from source code and fix it. + > @@ -742,9 +769,11 @@ For production level scenarios you will most likely want to leverage an enterpri Using Faiss for Storage - Install the required dependencies: + ``` pip install faiss-cpu ``` + You can also install `faiss-gpu` if you have GPU support. - Here we are using `sentence-transformers` but you can also use `OpenAIEmbedding` model with `3072` dimensions. @@ -810,6 +839,7 @@ relation = rag.create_relation("Google", "Gmail", { "weight": 2.0 }) ``` +
@@ -835,6 +865,7 @@ updated_relation = rag.edit_relation("Google", "Google Mail", { "weight": 3.0 }) ``` +
All operations are available in both synchronous and asynchronous versions. The asynchronous versions have the prefix "a" (e.g., `acreate_entity`, `aedit_relation`).

@@ -851,6 +882,55 @@ All operations are available in both synchronous and asynchronous versions. The

 These operations maintain data consistency across both the graph database and vector database components, ensuring your knowledge graph remains coherent.

+## Data Export Functions
+
+### Overview
+
+LightRAG allows you to export your knowledge graph data in various formats for analysis, sharing, and backup purposes. The system supports exporting entities, relations, and relationship data.
+
+### Export Functions
+
+#### Basic Usage
+
+```python
+# Basic CSV export (default format)
+rag.export_data("knowledge_graph.csv")
+
+# Specify any supported format
+rag.export_data("output.xlsx", file_format="excel")
+```
+
+#### Supported File Formats
+
+```python
+# Export data in CSV format
+rag.export_data("graph_data.csv", file_format="csv")
+
+# Export data as an Excel workbook
+rag.export_data("graph_data.xlsx", file_format="excel")
+
+# Export data as Markdown tables
+rag.export_data("graph_data.md", file_format="md")
+
+# Export data as plain text
+rag.export_data("graph_data.txt", file_format="txt")
+```
+### Additional Options
+
+Include vector embeddings in the export (optional):
+
+```python
+rag.export_data("complete_data.csv", include_vector_data=True)
+```
+### Data Included in Export
+
+All exports include:
+
+* Entity information (names, IDs, metadata)
+* Relation data (connections between entities)
+* Relationship information from the vector database
+
+
 ## Entity Merging
@@ -913,6 +993,7 @@ rag.merge_entities( ``` When merging entities: + * All relationships from source entities are redirected to the target entity * Duplicate relationships are intelligently merged * Self-relationships (loops) are prevented @@ -946,6 +1027,7 @@ rag.clear_cache(modes=["local"]) ``` Valid modes are: + - `"default"`: Extraction cache - `"naive"`: Naive search cache - `"local"`: Local search cache @@ -960,33 +1042,33 @@ Valid modes are:
Parameters -| **Parameter** | **Type** | **Explanation** | **Default** | -|----------------------------------------------| --- |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------| -| **working\_dir** | `str` | Directory where the cache will be stored | `lightrag_cache+timestamp` | -| **kv\_storage** | `str` | Storage type for documents and text chunks. Supported types: `JsonKVStorage`, `OracleKVStorage` | `JsonKVStorage` | -| **vector\_storage** | `str` | Storage type for embedding vectors. Supported types: `NanoVectorDBStorage`, `OracleVectorDBStorage` | `NanoVectorDBStorage` | -| **graph\_storage** | `str` | Storage type for graph edges and nodes. Supported types: `NetworkXStorage`, `Neo4JStorage`, `OracleGraphStorage` | `NetworkXStorage` | -| **chunk\_token\_size** | `int` | Maximum token size per chunk when splitting documents | `1200` | -| **chunk\_overlap\_token\_size** | `int` | Overlap token size between two chunks when splitting documents | `100` | -| **tiktoken\_model\_name** | `str` | Model name for the Tiktoken encoder used to calculate token numbers | `gpt-4o-mini` | -| **entity\_extract\_max\_gleaning** | `int` | Number of loops in the entity extraction process, appending history messages | `1` | -| **entity\_summary\_to\_max\_tokens** | `int` | Maximum token size for each entity summary | `500` | -| **node\_embedding\_algorithm** | `str` | Algorithm for node embedding (currently not used) | `node2vec` | -| **node2vec\_params** | `dict` | Parameters for node embedding | `{"dimensions": 1536,"num_walks": 10,"walk_length": 40,"window_size": 2,"iterations": 3,"random_seed": 3,}` | -| **embedding\_func** | `EmbeddingFunc` | Function to generate embedding vectors from text | `openai_embed` | -| **embedding\_batch\_num** | `int` | Maximum batch size for embedding processes (multiple texts sent per batch) | `32` | -| **embedding\_func\_max\_async** | `int` | Maximum number of concurrent asynchronous embedding processes | `16` | -| **llm\_model\_func** | `callable` | Function for LLM generation | `gpt_4o_mini_complete` | -| **llm\_model\_name** | `str` | LLM model name for generation | `meta-llama/Llama-3.2-1B-Instruct` | -| **llm\_model\_max\_token\_size** | `int` | Maximum token size for LLM generation (affects entity relation summaries) | `32768`(default value changed by env var MAX_TOKENS) | -| **llm\_model\_max\_async** | `int` | Maximum number of concurrent asynchronous LLM processes | `16`(default value changed by env var MAX_ASYNC) | -| **llm\_model\_kwargs** | `dict` | Additional parameters for LLM generation | | -| **vector\_db\_storage\_cls\_kwargs** | `dict` | Additional parameters for vector database, like setting the threshold for nodes and relations retrieval. 
| cosine_better_than_threshold: 0.2(default value changed by env var COSINE_THRESHOLD) | -| **enable\_llm\_cache** | `bool` | If `TRUE`, stores LLM results in cache; repeated prompts return cached responses | `TRUE` | -| **enable\_llm\_cache\_for\_entity\_extract** | `bool` | If `TRUE`, stores LLM results in cache for entity extraction; Good for beginners to debug your application | `TRUE` | -| **addon\_params** | `dict` | Additional parameters, e.g., `{"example_number": 1, "language": "Simplified Chinese", "entity_types": ["organization", "person", "geo", "event"], "insert_batch_size": 10}`: sets example limit, output language, and batch size for document processing | `example_number: all examples, language: English, insert_batch_size: 10` | -| **convert\_response\_to\_json\_func** | `callable` | Not used | `convert_response_to_json` | -| **embedding\_cache\_config** | `dict` | Configuration for question-answer caching. Contains three parameters:
- `enabled`: Boolean value to enable/disable cache lookup functionality. When enabled, the system will check cached responses before generating new answers.
- `similarity_threshold`: Float value (0-1), similarity threshold. When a new question's similarity with a cached question exceeds this threshold, the cached answer will be returned directly without calling the LLM.
- `use_llm_check`: Boolean value to enable/disable LLM similarity verification. When enabled, LLM will be used as a secondary check to verify the similarity between questions before returning cached answers. | Default: `{"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}` | +| **Parameter** | **Type** | **Explanation** | **Default** | +| -------------------------------------------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- | +| **working\_dir** | `str` | Directory where the cache will be stored | `lightrag_cache+timestamp` | +| **kv\_storage** | `str` | Storage type for documents and text chunks. Supported types:`JsonKVStorage`, `OracleKVStorage` | `JsonKVStorage` | +| **vector\_storage** | `str` | Storage type for embedding vectors. Supported types:`NanoVectorDBStorage`, `OracleVectorDBStorage` | `NanoVectorDBStorage` | +| **graph\_storage** | `str` | Storage type for graph edges and nodes. Supported types:`NetworkXStorage`, `Neo4JStorage`, `OracleGraphStorage` | `NetworkXStorage` | +| **chunk\_token\_size** | `int` | Maximum token size per chunk when splitting documents | `1200` | +| **chunk\_overlap\_token\_size** | `int` | Overlap token size between two chunks when splitting documents | `100` | +| **tiktoken\_model\_name** | `str` | Model name for the Tiktoken encoder used to calculate token numbers | `gpt-4o-mini` | +| **entity\_extract\_max\_gleaning** | `int` | Number of loops in the entity extraction process, appending history messages | `1` | +| **entity\_summary\_to\_max\_tokens** | `int` | Maximum token size for each entity summary | `500` | +| **node\_embedding\_algorithm** | `str` | Algorithm for node embedding (currently not used) | `node2vec` | +| **node2vec\_params** | `dict` | Parameters for node embedding | `{"dimensions": 1536,"num_walks": 10,"walk_length": 40,"window_size": 2,"iterations": 3,"random_seed": 3,}` | +| **embedding\_func** | `EmbeddingFunc` | Function to generate embedding vectors from text | `openai_embed` | +| **embedding\_batch\_num** | `int` | Maximum batch size for embedding processes (multiple texts sent per batch) | `32` | +| **embedding\_func\_max\_async** | `int` | Maximum number of concurrent asynchronous embedding processes | `16` | +| **llm\_model\_func** | `callable` | Function for LLM generation | `gpt_4o_mini_complete` | +| **llm\_model\_name** | `str` | LLM model name for generation | `meta-llama/Llama-3.2-1B-Instruct` | +| **llm\_model\_max\_token\_size** | `int` | Maximum token size for LLM generation (affects entity relation summaries) | `32768`(default value changed by env var MAX_TOKENS) | +| **llm\_model\_max\_async** | `int` | Maximum number of concurrent asynchronous LLM processes | `16`(default value changed by env var MAX_ASYNC) | +| 
**llm\_model\_kwargs** | `dict` | Additional parameters for LLM generation | | +| **vector\_db\_storage\_cls\_kwargs** | `dict` | Additional parameters for vector database, like setting the threshold for nodes and relations retrieval. | cosine_better_than_threshold: 0.2(default value changed by env var COSINE_THRESHOLD) | +| **enable\_llm\_cache** | `bool` | If `TRUE`, stores LLM results in cache; repeated prompts return cached responses | `TRUE` | +| **enable\_llm\_cache\_for\_entity\_extract** | `bool` | If `TRUE`, stores LLM results in cache for entity extraction; Good for beginners to debug your application | `TRUE` | +| **addon\_params** | `dict` | Additional parameters, e.g.,`{"example_number": 1, "language": "Simplified Chinese", "entity_types": ["organization", "person", "geo", "event"], "insert_batch_size": 10}`: sets example limit, output language, and batch size for document processing | `example_number: all examples, language: English, insert_batch_size: 10` | +| **convert\_response\_to\_json\_func** | `callable` | Not used | `convert_response_to_json` | +| **embedding\_cache\_config** | `dict` | Configuration for question-answer caching. Contains three parameters:`
`- `enabled`: Boolean value to enable/disable cache lookup functionality. When enabled, the system will check cached responses before generating new answers.`
`- `similarity_threshold`: Float value (0-1), similarity threshold. When a new question's similarity with a cached question exceeds this threshold, the cached answer will be returned directly without calling the LLM.`
`- `use_llm_check`: Boolean value to enable/disable LLM similarity verification. When enabled, LLM will be used as a secondary check to verify the similarity between questions before returning cached answers. | Default:`{"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}` |
@@ -996,12 +1078,15 @@ Valid modes are: Click to view error handling details The API includes comprehensive error handling: + - File not found errors (404) - Processing errors (500) - Supports multiple file encodings (UTF-8 and GBK) +
## API + LightRag can be installed with API support to serve a Fast api interface to perform data upload and indexing/Rag operations/Rescan of the input folder etc.. [LightRag API](lightrag/api/README.md) @@ -1035,7 +1120,6 @@ net.show('knowledge_graph.html')
Graph visualization with Neo4 - * The following code can be found in `examples/graph_visual_with_neo4j.py` ```python @@ -1171,10 +1255,13 @@ LightRag can be installed with Tools support to add extra tools like the graphml
## Evaluation + ### Dataset + The dataset used in LightRAG can be downloaded from [TommyChien/UltraDomain](https://huggingface.co/datasets/TommyChien/UltraDomain). ### Generate Query + LightRAG uses the following prompt to generate high-level queries, with the corresponding code in `example/generate_query.py`.
@@ -1203,9 +1290,11 @@ Output the results in the following structure: - User 5: [user description] ... ``` +
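For illustration, a rough sketch of driving this prompt programmatically; the real driver lives in `example/generate_query.py`, and the `openai_complete_if_cache` helper, model name, and dataset description used here are assumptions:

```python
import asyncio

from lightrag.llm.openai import openai_complete_if_cache

# Placeholder dataset description; substitute a summary of your own corpus
DESCRIPTION = "A collection of agriculture research articles."

PROMPT = f"""
Given the following description of a dataset:

{DESCRIPTION}

Please identify 5 potential users who would engage with this dataset.
For each user, list 5 tasks they would perform, and for each (user, task)
combination, generate 5 questions, following the structure shown above.
"""


async def main():
    # Model name is illustrative; any chat-completion model works here
    queries = await openai_complete_if_cache("gpt-4o-mini", PROMPT)
    print(queries)


if __name__ == "__main__":
    asyncio.run(main())
```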
### Batch Eval + To evaluate the performance of two RAG systems on high-level queries, LightRAG uses the following prompt, with the specific code available in `example/batch_eval.py`.
@@ -1253,37 +1342,40 @@ Output your evaluation in the following JSON format: }} }} ``` +
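Once the per-query verdicts are collected, they can be tallied; the sketch below assumes one JSON evaluation object per line and a top-level `"Overall Winner"` key, which are assumptions about the prompt's output schema (see `example/batch_eval.py` for the actual driver):

```python
import json

# Assumed layout: one JSON evaluation object per line, produced by the prompt above
with open("batch_eval_results.jsonl", "r", encoding="utf-8") as f:
    evaluations = [json.loads(line) for line in f if line.strip()]

# Tally which answer the judge preferred overall
wins = {"Answer 1": 0, "Answer 2": 0}
for evaluation in evaluations:
    winner = evaluation["Overall Winner"]["Winner"]
    wins[winner] = wins.get(winner, 0) + 1

print(f"Answer 1: {wins['Answer 1']} wins, Answer 2: {wins['Answer 2']} wins")
```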
### Overall Performance Table -| | **Agriculture** | | **CS** | | **Legal** | | **Mix** | | -|----------------------|-------------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------| -| | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | -| **Comprehensiveness** | 32.4% | **67.6%** | 38.4% | **61.6%** | 16.4% | **83.6%** | 38.8% | **61.2%** | -| **Diversity** | 23.6% | **76.4%** | 38.0% | **62.0%** | 13.6% | **86.4%** | 32.4% | **67.6%** | -| **Empowerment** | 32.4% | **67.6%** | 38.8% | **61.2%** | 16.4% | **83.6%** | 42.8% | **57.2%** | -| **Overall** | 32.4% | **67.6%** | 38.8% | **61.2%** | 15.2% | **84.8%** | 40.0% | **60.0%** | -| | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | -| **Comprehensiveness** | 31.6% | **68.4%** | 38.8% | **61.2%** | 15.2% | **84.8%** | 39.2% | **60.8%** | -| **Diversity** | 29.2% | **70.8%** | 39.2% | **60.8%** | 11.6% | **88.4%** | 30.8% | **69.2%** | -| **Empowerment** | 31.6% | **68.4%** | 36.4% | **63.6%** | 15.2% | **84.8%** | 42.4% | **57.6%** | -| **Overall** | 32.4% | **67.6%** | 38.0% | **62.0%** | 14.4% | **85.6%** | 40.0% | **60.0%** | -| | HyDE | **LightRAG** | HyDE | **LightRAG** | HyDE | **LightRAG** | HyDE | **LightRAG** | -| **Comprehensiveness** | 26.0% | **74.0%** | 41.6% | **58.4%** | 26.8% | **73.2%** | 40.4% | **59.6%** | -| **Diversity** | 24.0% | **76.0%** | 38.8% | **61.2%** | 20.0% | **80.0%** | 32.4% | **67.6%** | -| **Empowerment** | 25.2% | **74.8%** | 40.8% | **59.2%** | 26.0% | **74.0%** | 46.0% | **54.0%** | -| **Overall** | 24.8% | **75.2%** | 41.6% | **58.4%** | 26.4% | **73.6%** | 42.4% | **57.6%** | -| | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | -| **Comprehensiveness** | 45.6% | **54.4%** | 48.4% | **51.6%** | 48.4% | **51.6%** | **50.4%** | 49.6% | -| **Diversity** | 22.8% | **77.2%** | 40.8% | **59.2%** | 26.4% | **73.6%** | 36.0% | **64.0%** | -| **Empowerment** | 41.2% | **58.8%** | 45.2% | **54.8%** | 43.6% | **56.4%** | **50.8%** | 49.2% | -| **Overall** | 45.2% | **54.8%** | 48.0% | **52.0%** | 47.2% | **52.8%** | **50.4%** | 49.6% | +| | **Agriculture** | | **CS** | | **Legal** | | **Mix** | | +| --------------------------- | --------------------- | ------------------ | ------------ | ------------------ | --------------- | ------------------ | --------------- | ------------------ | +| | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | NaiveRAG | **LightRAG** | +| **Comprehensiveness** | 32.4% | **67.6%** | 38.4% | **61.6%** | 16.4% | **83.6%** | 38.8% | **61.2%** | +| **Diversity** | 23.6% | **76.4%** | 38.0% | **62.0%** | 13.6% | **86.4%** | 32.4% | **67.6%** | +| **Empowerment** | 32.4% | **67.6%** | 38.8% | **61.2%** | 16.4% | **83.6%** | 42.8% | **57.2%** | +| **Overall** | 32.4% | **67.6%** | 38.8% | **61.2%** | 15.2% | **84.8%** | 40.0% | **60.0%** | +| | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | RQ-RAG | **LightRAG** | +| **Comprehensiveness** | 31.6% | **68.4%** | 38.8% | **61.2%** | 15.2% | **84.8%** | 39.2% | **60.8%** | +| **Diversity** | 29.2% | **70.8%** | 39.2% | **60.8%** | 11.6% | **88.4%** | 30.8% | **69.2%** | +| **Empowerment** | 31.6% | **68.4%** | 36.4% | **63.6%** | 15.2% | **84.8%** | 42.4% | **57.6%** | +| **Overall** | 32.4% | **67.6%** | 
38.0% | **62.0%** | 14.4% | **85.6%** | 40.0% | **60.0%** | +| | HyDE | **LightRAG** | HyDE | **LightRAG** | HyDE | **LightRAG** | HyDE | **LightRAG** | +| **Comprehensiveness** | 26.0% | **74.0%** | 41.6% | **58.4%** | 26.8% | **73.2%** | 40.4% | **59.6%** | +| **Diversity** | 24.0% | **76.0%** | 38.8% | **61.2%** | 20.0% | **80.0%** | 32.4% | **67.6%** | +| **Empowerment** | 25.2% | **74.8%** | 40.8% | **59.2%** | 26.0% | **74.0%** | 46.0% | **54.0%** | +| **Overall** | 24.8% | **75.2%** | 41.6% | **58.4%** | 26.4% | **73.6%** | 42.4% | **57.6%** | +| | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | GraphRAG | **LightRAG** | +| **Comprehensiveness** | 45.6% | **54.4%** | 48.4% | **51.6%** | 48.4% | **51.6%** | **50.4%** | 49.6% | +| **Diversity** | 22.8% | **77.2%** | 40.8% | **59.2%** | 26.4% | **73.6%** | 36.0% | **64.0%** | +| **Empowerment** | 41.2% | **58.8%** | 45.2% | **54.8%** | 43.6% | **56.4%** | **50.8%** | 49.2% | +| **Overall** | 45.2% | **54.8%** | 48.0% | **52.0%** | 47.2% | **52.8%** | **50.4%** | 49.6% | ## Reproduce + All the code can be found in the `./reproduce` directory. ### Step-0 Extract Unique Contexts + First, we need to extract unique contexts in the datasets.
@@ -1340,9 +1432,11 @@ def extract_unique_contexts(input_directory, output_directory): print("All files have been processed.") ``` +
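A small usage sketch for the extraction step above; the directory paths are placeholders:

```python
# Paths are placeholders; point them at the downloaded UltraDomain files
extract_unique_contexts(
    input_directory="./datasets/ultradomain",
    output_directory="./datasets/unique_contexts",
)
```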
### Step-1 Insert Contexts + For the extracted contexts, we insert them into the LightRAG system.
@@ -1366,6 +1460,7 @@ def insert_text(rag, file_path): if retries == max_retries: print("Insertion failed after exceeding the maximum number of retries") ``` +
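A usage sketch for `insert_text`; the working directory, file path, and model choice are placeholders, and depending on your LightRAG version you may also need the storage initialization shown in the Quick Start:

```python
import os

from lightrag import LightRAG
from lightrag.llm.openai import gpt_4o_mini_complete

WORKING_DIR = "./ultradomain_index"  # placeholder
if not os.path.exists(WORKING_DIR):
    os.makedirs(WORKING_DIR)

# Embeddings default to openai_embed; adjust llm_model_func as needed
rag = LightRAG(working_dir=WORKING_DIR, llm_model_func=gpt_4o_mini_complete)

# Insert one of the unique-context files produced in Step-0 (path is illustrative)
insert_text(rag, "./datasets/unique_contexts/agriculture_unique_contexts.json")
```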
### Step-2 Generate Queries @@ -1390,9 +1485,11 @@ def get_summary(context, tot_tokens=2000): return summary ``` + ### Step-3 Query + For the queries generated in Step-2, we will extract them and query LightRAG.
@@ -1409,6 +1506,7 @@ def extract_queries(file_path): return queries ``` +
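To complete Step-3, the extracted queries can then be run against the index built in Step-1; a minimal sketch, assuming the `rag` instance from Step-1 and a placeholder query file path:

```python
from lightrag import QueryParam

# Placeholder path; use the query file produced in Step-2
queries = extract_queries("./queries/agriculture_queries.txt")

for query_text in queries:
    # `rag` is the LightRAG instance built in Step-1
    result = rag.query(query_text, param=QueryParam(mode="hybrid"))
    print(result)
```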
## Star History @@ -1441,4 +1539,5 @@ archivePrefix={arXiv}, primaryClass={cs.IR} } ``` + **Thank you for your interest in our work!** diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 3a5e4e84..4f374890 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -3,11 +3,14 @@ from __future__ import annotations import asyncio import configparser import os +import csv import warnings from dataclasses import asdict, dataclass, field from datetime import datetime from functools import partial -from typing import Any, AsyncIterator, Callable, Iterator, cast, final +from typing import Any, AsyncIterator, Callable, Iterator, cast, final, Literal +import pandas as pd + from lightrag.kg import ( STORAGE_ENV_REQUIREMENTS, @@ -2592,6 +2595,322 @@ class LightRAG: logger.error(f"Error merging entities: {e}") raise + async def aexport_data( + self, + output_path: str, + file_format: Literal["csv", "excel", "md", "txt"] = "csv", + include_vector_data: bool = False, + ) -> None: + """ + Asynchronously exports all entities, relations, and relationships to various formats. + Args: + output_path: The path to the output file (including extension). + file_format: Output format - "csv", "excel", "md", "txt". + - csv: Comma-separated values file + - excel: Microsoft Excel file with multiple sheets + - md: Markdown tables + - txt: Plain text formatted output + - table: Print formatted tables to console + include_vector_data: Whether to include data from the vector database. + """ + # Collect data + entities_data = [] + relations_data = [] + relationships_data = [] + + # --- Entities --- + all_entities = await self.chunk_entity_relation_graph.get_all_labels() + for entity_name in all_entities: + entity_info = await self.get_entity_info( + entity_name, include_vector_data=include_vector_data + ) + entity_row = { + "entity_name": entity_name, + "source_id": entity_info["source_id"], + "graph_data": str( + entity_info["graph_data"] + ), # Convert to string to ensure compatibility + } + if include_vector_data and "vector_data" in entity_info: + entity_row["vector_data"] = str(entity_info["vector_data"]) + entities_data.append(entity_row) + + # --- Relations --- + for src_entity in all_entities: + for tgt_entity in all_entities: + if src_entity == tgt_entity: + continue + + edge_exists = await self.chunk_entity_relation_graph.has_edge( + src_entity, tgt_entity + ) + if edge_exists: + relation_info = await self.get_relation_info( + src_entity, tgt_entity, include_vector_data=include_vector_data + ) + relation_row = { + "src_entity": src_entity, + "tgt_entity": tgt_entity, + "source_id": relation_info["source_id"], + "graph_data": str( + relation_info["graph_data"] + ), # Convert to string + } + if include_vector_data and "vector_data" in relation_info: + relation_row["vector_data"] = str(relation_info["vector_data"]) + relations_data.append(relation_row) + + # --- Relationships (from VectorDB) --- + all_relationships = await self.relationships_vdb.client_storage + for rel in all_relationships["data"]: + relationships_data.append( + { + "relationship_id": rel["__id__"], + "data": str(rel), # Convert to string for compatibility + } + ) + + # Export based on format + if file_format == "csv": + # CSV export + with open(output_path, "w", newline="", encoding="utf-8") as csvfile: + # Entities + if entities_data: + csvfile.write("# ENTITIES\n") + writer = csv.DictWriter(csvfile, fieldnames=entities_data[0].keys()) + writer.writeheader() + writer.writerows(entities_data) + csvfile.write("\n\n") + + # 
Relations + if relations_data: + csvfile.write("# RELATIONS\n") + writer = csv.DictWriter( + csvfile, fieldnames=relations_data[0].keys() + ) + writer.writeheader() + writer.writerows(relations_data) + csvfile.write("\n\n") + + # Relationships + if relationships_data: + csvfile.write("# RELATIONSHIPS\n") + writer = csv.DictWriter( + csvfile, fieldnames=relationships_data[0].keys() + ) + writer.writeheader() + writer.writerows(relationships_data) + + elif file_format == "excel": + # Excel export + entities_df = ( + pd.DataFrame(entities_data) if entities_data else pd.DataFrame() + ) + relations_df = ( + pd.DataFrame(relations_data) if relations_data else pd.DataFrame() + ) + relationships_df = ( + pd.DataFrame(relationships_data) + if relationships_data + else pd.DataFrame() + ) + + with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer: + if not entities_df.empty: + entities_df.to_excel(writer, sheet_name="Entities", index=False) + if not relations_df.empty: + relations_df.to_excel(writer, sheet_name="Relations", index=False) + if not relationships_df.empty: + relationships_df.to_excel( + writer, sheet_name="Relationships", index=False + ) + + elif file_format == "md": + # Markdown export + with open(output_path, "w", encoding="utf-8") as mdfile: + mdfile.write("# LightRAG Data Export\n\n") + + # Entities + mdfile.write("## Entities\n\n") + if entities_data: + # Write header + mdfile.write("| " + " | ".join(entities_data[0].keys()) + " |\n") + mdfile.write( + "| " + + " | ".join(["---"] * len(entities_data[0].keys())) + + " |\n" + ) + + # Write rows + for entity in entities_data: + mdfile.write( + "| " + " | ".join(str(v) for v in entity.values()) + " |\n" + ) + mdfile.write("\n\n") + else: + mdfile.write("*No entity data available*\n\n") + + # Relations + mdfile.write("## Relations\n\n") + if relations_data: + # Write header + mdfile.write("| " + " | ".join(relations_data[0].keys()) + " |\n") + mdfile.write( + "| " + + " | ".join(["---"] * len(relations_data[0].keys())) + + " |\n" + ) + + # Write rows + for relation in relations_data: + mdfile.write( + "| " + + " | ".join(str(v) for v in relation.values()) + + " |\n" + ) + mdfile.write("\n\n") + else: + mdfile.write("*No relation data available*\n\n") + + # Relationships + mdfile.write("## Relationships\n\n") + if relationships_data: + # Write header + mdfile.write( + "| " + " | ".join(relationships_data[0].keys()) + " |\n" + ) + mdfile.write( + "| " + + " | ".join(["---"] * len(relationships_data[0].keys())) + + " |\n" + ) + + # Write rows + for relationship in relationships_data: + mdfile.write( + "| " + + " | ".join(str(v) for v in relationship.values()) + + " |\n" + ) + else: + mdfile.write("*No relationship data available*\n\n") + + elif file_format == "txt": + # Plain text export + with open(output_path, "w", encoding="utf-8") as txtfile: + txtfile.write("LIGHTRAG DATA EXPORT\n") + txtfile.write("=" * 80 + "\n\n") + + # Entities + txtfile.write("ENTITIES\n") + txtfile.write("-" * 80 + "\n") + if entities_data: + # Create fixed width columns + col_widths = { + k: max(len(k), max(len(str(e[k])) for e in entities_data)) + for k in entities_data[0] + } + header = " ".join(k.ljust(col_widths[k]) for k in entities_data[0]) + txtfile.write(header + "\n") + txtfile.write("-" * len(header) + "\n") + + # Write rows + for entity in entities_data: + row = " ".join( + str(v).ljust(col_widths[k]) for k, v in entity.items() + ) + txtfile.write(row + "\n") + txtfile.write("\n\n") + else: + txtfile.write("No entity data available\n\n") + + 
# Relations + txtfile.write("RELATIONS\n") + txtfile.write("-" * 80 + "\n") + if relations_data: + # Create fixed width columns + col_widths = { + k: max(len(k), max(len(str(r[k])) for r in relations_data)) + for k in relations_data[0] + } + header = " ".join( + k.ljust(col_widths[k]) for k in relations_data[0] + ) + txtfile.write(header + "\n") + txtfile.write("-" * len(header) + "\n") + + # Write rows + for relation in relations_data: + row = " ".join( + str(v).ljust(col_widths[k]) for k, v in relation.items() + ) + txtfile.write(row + "\n") + txtfile.write("\n\n") + else: + txtfile.write("No relation data available\n\n") + + # Relationships + txtfile.write("RELATIONSHIPS\n") + txtfile.write("-" * 80 + "\n") + if relationships_data: + # Create fixed width columns + col_widths = { + k: max(len(k), max(len(str(r[k])) for r in relationships_data)) + for k in relationships_data[0] + } + header = " ".join( + k.ljust(col_widths[k]) for k in relationships_data[0] + ) + txtfile.write(header + "\n") + txtfile.write("-" * len(header) + "\n") + + # Write rows + for relationship in relationships_data: + row = " ".join( + str(v).ljust(col_widths[k]) for k, v in relationship.items() + ) + txtfile.write(row + "\n") + else: + txtfile.write("No relationship data available\n\n") + + else: + raise ValueError( + f"Unsupported file format: {file_format}. " + f"Choose from: csv, excel, md, txt" + ) + if file_format is not None: + print(f"Data exported to: {output_path} with format: {file_format}") + else: + print("Data displayed as table format") + + def export_data( + self, + output_path: str, + file_format: Literal["csv", "excel", "md", "txt"] = "csv", + include_vector_data: bool = False, + ) -> None: + """ + Synchronously exports all entities, relations, and relationships to various formats. + Args: + output_path: The path to the output file (including extension). + file_format: Output format - "csv", "excel", "md", "txt". + - csv: Comma-separated values file + - excel: Microsoft Excel file with multiple sheets + - md: Markdown tables + - txt: Plain text formatted output + - table: Print formatted tables to console + include_vector_data: Whether to include data from the vector database. + """ + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + loop.run_until_complete( + self.aexport_data(output_path, file_format, include_vector_data) + ) + def merge_entities( self, source_entities: list[str], diff --git a/requirements.txt b/requirements.txt index d9a5c68e..d35167fd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,12 @@ future # Basic modules gensim + +# Additional Packages for export Functionality +pandas>=2.0.0 + +# Extra libraries are installed when needed using pipmaster + pipmaster pydantic python-dotenv @@ -13,5 +19,4 @@ tenacity # LLM packages tiktoken - -# Extra libraries are installed when needed using pipmaster +xlsxwriter>=3.1.0 From ef754a43002cddfa3da036d95445e8928552e778 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 13 Mar 2025 12:38:33 +0800 Subject: [PATCH 02/23] Explicitly set API docs and schema URLs. 
- Set OpenAPI schema URL to `/openapi.json` - Set docs URL to `/docs` - Set redoc URL to `/redoc` - Update Vite config for API docs routing - Ensure proper path handling for docs endpoints --- lightrag/api/lightrag_server.py | 3 +++ lightrag_webui/vite.config.ts | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index fd09a691..f4b21e66 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -177,6 +177,9 @@ def create_app(args): if api_key else "", version=__api_version__, + openapi_url="/openapi.json", # Explicitly set OpenAPI schema URL + docs_url="/docs", # Explicitly set docs URL + redoc_url="/redoc", # Explicitly set redoc URL openapi_tags=[{"name": "api"}], lifespan=lifespan, ) diff --git a/lightrag_webui/vite.config.ts b/lightrag_webui/vite.config.ts index 09ecd7ea..b05bf2fa 100644 --- a/lightrag_webui/vite.config.ts +++ b/lightrag_webui/vite.config.ts @@ -26,7 +26,9 @@ export default defineConfig({ target: import.meta.env.VITE_BACKEND_URL || 'http://localhost:9621', changeOrigin: true, rewrite: endpoint === '/api' ? - (path) => path.replace(/^\/api/, '') : undefined + (path) => path.replace(/^\/api/, '') : + endpoint === '/docs' || endpoint === '/openapi.json' ? + (path) => path : undefined } ]) ) : {} From 002d675dc6565d84311e756d771c069a4540a1ed Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 13 Mar 2025 12:38:33 +0800 Subject: [PATCH 03/23] Explicitly set API docs and schema URLs. - Set OpenAPI schema URL to `/openapi.json` - Set docs URL to `/docs` - Set redoc URL to `/redoc` - Update Vite config for API docs routing - Ensure proper path handling for docs endpoints --- lightrag/api/lightrag_server.py | 3 +++ lightrag_webui/vite.config.ts | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 4c75430f..9267c370 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -177,6 +177,9 @@ def create_app(args): if api_key else "", version=__api_version__, + openapi_url="/openapi.json", # Explicitly set OpenAPI schema URL + docs_url="/docs", # Explicitly set docs URL + redoc_url="/redoc", # Explicitly set redoc URL openapi_tags=[{"name": "api"}], lifespan=lifespan, ) diff --git a/lightrag_webui/vite.config.ts b/lightrag_webui/vite.config.ts index 09ecd7ea..b05bf2fa 100644 --- a/lightrag_webui/vite.config.ts +++ b/lightrag_webui/vite.config.ts @@ -26,7 +26,9 @@ export default defineConfig({ target: import.meta.env.VITE_BACKEND_URL || 'http://localhost:9621', changeOrigin: true, rewrite: endpoint === '/api' ? - (path) => path.replace(/^\/api/, '') : undefined + (path) => path.replace(/^\/api/, '') : + endpoint === '/docs' || endpoint === '/openapi.json' ? 
+ (path) => path : undefined } ]) ) : {} From 20b8a9d7e9822fe1878cedee0dfd2393e73bb33f Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 13 Mar 2025 13:00:03 +0800 Subject: [PATCH 04/23] Remove redundant label fetch on mount --- lightrag_webui/src/hooks/useLightragGraph.tsx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/lightrag_webui/src/hooks/useLightragGraph.tsx b/lightrag_webui/src/hooks/useLightragGraph.tsx index dc3294cf..e2ce5943 100644 --- a/lightrag_webui/src/hooks/useLightragGraph.tsx +++ b/lightrag_webui/src/hooks/useLightragGraph.tsx @@ -169,11 +169,6 @@ const useLightrangeGraph = () => { const minDegree = useSettingsStore.use.graphMinDegree() const isFetching = useGraphStore.use.isFetching() - // Fetch all database labels on mount - useEffect(() => { - useGraphStore.getState().fetchAllDatabaseLabels() - }, []) - // Use ref to track fetch status const fetchStatusRef = useRef>({}); @@ -276,6 +271,7 @@ const useLightrangeGraph = () => { const state = useGraphStore.getState() state.reset() state.setSigmaGraph(new DirectedGraph()) + state.setGraphLabels(['*']) } }, [queryLabel, maxQueryDepth, minDegree, isFetching]) From d28a94d55ddd6e29f3a544464d58ded9b460a8a9 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 13 Mar 2025 15:04:42 +0800 Subject: [PATCH 05/23] Refactor AsyncSearch component for better key management --- lightrag_webui/src/components/ui/AsyncSearch.tsx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lightrag_webui/src/components/ui/AsyncSearch.tsx b/lightrag_webui/src/components/ui/AsyncSearch.tsx index 339c0313..b1c25fe9 100644 --- a/lightrag_webui/src/components/ui/AsyncSearch.tsx +++ b/lightrag_webui/src/components/ui/AsyncSearch.tsx @@ -1,4 +1,4 @@ -import { useState, useEffect, useCallback } from 'react' +import React, { useState, useEffect, useCallback } from 'react' import { Loader2 } from 'lucide-react' import { useDebounce } from '@/hooks/useDebounce' @@ -204,7 +204,7 @@ export function AsyncSearch({ ))} {options.map((option, idx) => ( - <> + ({ {renderOption(option)} {idx !== options.length - 1 && ( -
+
)} - + ))} From 3b6fabca0efa40939a3ddf7ecda0e0e1c548a10a Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 13 Mar 2025 15:15:42 +0800 Subject: [PATCH 06/23] Added tab visibility context and provider for dynamic tab management - Introduced TabVisibilityProvider component - Created TabContent for conditional rendering - Added context and hooks for tab visibility - Updated DocumentManager dependencies - Integrated provider in App component --- lightrag_webui/src/App.tsx | 57 ++++++++++--------- .../src/components/ui/TabContent.tsx | 39 +++++++++++++ .../src/contexts/TabVisibilityProvider.tsx | 38 +++++++++++++ lightrag_webui/src/contexts/context.ts | 12 ++++ lightrag_webui/src/contexts/types.ts | 5 ++ .../src/contexts/useTabVisibility.ts | 17 ++++++ .../src/features/DocumentManager.tsx | 8 +-- 7 files changed, 145 insertions(+), 31 deletions(-) create mode 100644 lightrag_webui/src/components/ui/TabContent.tsx create mode 100644 lightrag_webui/src/contexts/TabVisibilityProvider.tsx create mode 100644 lightrag_webui/src/contexts/context.ts create mode 100644 lightrag_webui/src/contexts/types.ts create mode 100644 lightrag_webui/src/contexts/useTabVisibility.ts diff --git a/lightrag_webui/src/App.tsx b/lightrag_webui/src/App.tsx index 1cf8c5e3..bb4a84cb 100644 --- a/lightrag_webui/src/App.tsx +++ b/lightrag_webui/src/App.tsx @@ -1,5 +1,6 @@ import { useState, useCallback } from 'react' import ThemeProvider from '@/components/ThemeProvider' +import TabVisibilityProvider from '@/contexts/TabVisibilityProvider' import MessageAlert from '@/components/MessageAlert' import ApiKeyAlert from '@/components/ApiKeyAlert' import StatusIndicator from '@/components/graph/StatusIndicator' @@ -54,33 +55,35 @@ function App() { return ( -
- - -
- - - - - - - - - - - - -
-
- {enableHealthCheck && } - {message !== null && !apiKeyInvalid && } - {apiKeyInvalid && } - -
+ +
+ + +
+ + + + + + + + + + + + +
+
+ {enableHealthCheck && } + {message !== null && !apiKeyInvalid && } + {apiKeyInvalid && } + +
+
) } diff --git a/lightrag_webui/src/components/ui/TabContent.tsx b/lightrag_webui/src/components/ui/TabContent.tsx new file mode 100644 index 00000000..f3c0b80f --- /dev/null +++ b/lightrag_webui/src/components/ui/TabContent.tsx @@ -0,0 +1,39 @@ +import React, { useEffect } from 'react'; +import { useTabVisibility } from '@/contexts/useTabVisibility'; + +interface TabContentProps { + tabId: string; + children: React.ReactNode; + className?: string; +} + +/** + * TabContent component that manages visibility based on tab selection + * Works with the TabVisibilityContext to show/hide content based on active tab + */ +const TabContent: React.FC = ({ tabId, children, className = '' }) => { + const { isTabVisible, setTabVisibility } = useTabVisibility(); + const isVisible = isTabVisible(tabId); + + // Register this tab with the context when mounted + useEffect(() => { + setTabVisibility(tabId, true); + + // Cleanup when unmounted + return () => { + setTabVisibility(tabId, false); + }; + }, [tabId, setTabVisibility]); + + if (!isVisible) { + return null; + } + + return ( +
+ {children} +
+ ); +}; + +export default TabContent; diff --git a/lightrag_webui/src/contexts/TabVisibilityProvider.tsx b/lightrag_webui/src/contexts/TabVisibilityProvider.tsx new file mode 100644 index 00000000..73be2f64 --- /dev/null +++ b/lightrag_webui/src/contexts/TabVisibilityProvider.tsx @@ -0,0 +1,38 @@ +import React, { useState, useMemo } from 'react'; +import { TabVisibilityContext } from './context'; +import { TabVisibilityContextType } from './types'; + +interface TabVisibilityProviderProps { + children: React.ReactNode; +} + +/** + * Provider component for the TabVisibility context + * Manages the visibility state of tabs throughout the application + */ +export const TabVisibilityProvider: React.FC = ({ children }) => { + const [visibleTabs, setVisibleTabs] = useState>({}); + + // Create the context value with memoization to prevent unnecessary re-renders + const contextValue = useMemo( + () => ({ + visibleTabs, + setTabVisibility: (tabId: string, isVisible: boolean) => { + setVisibleTabs((prev) => ({ + ...prev, + [tabId]: isVisible, + })); + }, + isTabVisible: (tabId: string) => !!visibleTabs[tabId], + }), + [visibleTabs] + ); + + return ( + + {children} + + ); +}; + +export default TabVisibilityProvider; diff --git a/lightrag_webui/src/contexts/context.ts b/lightrag_webui/src/contexts/context.ts new file mode 100644 index 00000000..e6b569a2 --- /dev/null +++ b/lightrag_webui/src/contexts/context.ts @@ -0,0 +1,12 @@ +import { createContext } from 'react'; +import { TabVisibilityContextType } from './types'; + +// Default context value +const defaultContext: TabVisibilityContextType = { + visibleTabs: {}, + setTabVisibility: () => {}, + isTabVisible: () => false, +}; + +// Create the context +export const TabVisibilityContext = createContext(defaultContext); diff --git a/lightrag_webui/src/contexts/types.ts b/lightrag_webui/src/contexts/types.ts new file mode 100644 index 00000000..051c398c --- /dev/null +++ b/lightrag_webui/src/contexts/types.ts @@ -0,0 +1,5 @@ +export interface TabVisibilityContextType { + visibleTabs: Record; + setTabVisibility: (tabId: string, isVisible: boolean) => void; + isTabVisible: (tabId: string) => boolean; +} diff --git a/lightrag_webui/src/contexts/useTabVisibility.ts b/lightrag_webui/src/contexts/useTabVisibility.ts new file mode 100644 index 00000000..436ce50f --- /dev/null +++ b/lightrag_webui/src/contexts/useTabVisibility.ts @@ -0,0 +1,17 @@ +import { useContext } from 'react'; +import { TabVisibilityContext } from './context'; +import { TabVisibilityContextType } from './types'; + +/** + * Custom hook to access the tab visibility context + * @returns The tab visibility context + */ +export const useTabVisibility = (): TabVisibilityContextType => { + const context = useContext(TabVisibilityContext); + + if (!context) { + throw new Error('useTabVisibility must be used within a TabVisibilityProvider'); + } + + return context; +}; diff --git a/lightrag_webui/src/features/DocumentManager.tsx b/lightrag_webui/src/features/DocumentManager.tsx index b8841fe4..3d4a6717 100644 --- a/lightrag_webui/src/features/DocumentManager.tsx +++ b/lightrag_webui/src/features/DocumentManager.tsx @@ -48,11 +48,11 @@ export default function DocumentManager() { } catch (err) { toast.error(t('documentPanel.documentManager.errors.loadFailed', { error: errorMessage(err) })) } - }, [setDocs]) + }, [setDocs, t]) useEffect(() => { fetchDocuments() - }, []) // eslint-disable-line react-hooks/exhaustive-deps + }, [fetchDocuments, t]) const scanDocuments = useCallback(async () => { try { 
@@ -61,7 +61,7 @@ export default function DocumentManager() { } catch (err) { toast.error(t('documentPanel.documentManager.errors.scanFailed', { error: errorMessage(err) })) } - }, []) + }, [t]) useEffect(() => { const interval = setInterval(async () => { @@ -75,7 +75,7 @@ export default function DocumentManager() { } }, 5000) return () => clearInterval(interval) - }, [health, fetchDocuments]) + }, [health, fetchDocuments, t]) return ( From e2b9be2dc6b5ed5dbcc56b5ed6ddd3f6faadf1c2 Mon Sep 17 00:00:00 2001 From: zrguo Date: Thu, 13 Mar 2025 16:52:48 +0800 Subject: [PATCH 07/23] Update lightrag.py --- lightrag/lightrag.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 4f374890..a466e572 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -1114,6 +1114,7 @@ class LightRAG: # Prepare node data node_data: dict[str, str] = { + "entity_id": entity_name, "entity_type": entity_type, "description": description, "source_id": source_id, @@ -1151,6 +1152,7 @@ class LightRAG: await self.chunk_entity_relation_graph.upsert_node( need_insert_id, node_data={ + "entity_id": need_insert_id, "source_id": source_id, "description": "UNKNOWN", "entity_type": "UNKNOWN", @@ -2160,6 +2162,7 @@ class LightRAG: # Prepare node data with defaults if missing node_data = { + "entity_id": entity_name, "entity_type": entity_data.get("entity_type", "UNKNOWN"), "description": entity_data.get("description", ""), "source_id": entity_data.get("source_id", "manual"), From 2170a5d7781473a2526c5061d2f1bf695956a768 Mon Sep 17 00:00:00 2001 From: zrguo Date: Thu, 13 Mar 2025 16:58:03 +0800 Subject: [PATCH 08/23] Update __init__.py --- lightrag/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index 382060f7..89475dca 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam -__version__ = "1.2.5" +__version__ = "1.2.6" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" From 6893e3c4e2b0f387cfc5e1b14c31cd2dc6764bdc Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 13 Mar 2025 17:39:06 +0800 Subject: [PATCH 09/23] Unify two log filters into one and move it to utils --- lightrag/api/gunicorn_config.py | 2 +- lightrag/api/lightrag_server.py | 37 +-------------------------------- lightrag/utils.py | 3 ++- 3 files changed, 4 insertions(+), 38 deletions(-) diff --git a/lightrag/api/gunicorn_config.py b/lightrag/api/gunicorn_config.py index 0594ceae..23e46807 100644 --- a/lightrag/api/gunicorn_config.py +++ b/lightrag/api/gunicorn_config.py @@ -59,7 +59,7 @@ logconfig_dict = { }, "filters": { "path_filter": { - "()": "lightrag.api.lightrag_server.LightragPathFilter", + "()": "lightrag.utils.LightragPathFilter", }, }, "loggers": { diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 9267c370..ca4425e5 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -55,41 +55,6 @@ config = configparser.ConfigParser() config.read("config.ini") -class LightragPathFilter(logging.Filter): - """Filter for lightrag logger to filter out frequent path access logs""" - - def __init__(self): - super().__init__() - # Define paths to be filtered - self.filtered_paths = ["/documents", "/health", "/webui/"] - - def filter(self, record): - try: - # Check if record has the required attributes for an access log - if not hasattr(record, "args") or not 
isinstance(record.args, tuple): - return True - if len(record.args) < 5: - return True - - # Extract method, path and status from the record args - method = record.args[1] - path = record.args[2] - status = record.args[4] - - # Filter out successful GET requests to filtered paths - if ( - method == "GET" - and (status == 200 or status == 304) - and path in self.filtered_paths - ): - return False - - return True - except Exception: - # In case of any error, let the message through - return True - - def create_app(args): # Setup logging logger.setLevel(args.log_level) @@ -531,7 +496,7 @@ def configure_logging(): }, "filters": { "path_filter": { - "()": "lightrag.api.lightrag_server.LightragPathFilter", + "()": "lightrag.utils.LightragPathFilter", }, }, } diff --git a/lightrag/utils.py b/lightrag/utils.py index b8f00c5d..9f751ec7 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -75,7 +75,8 @@ class LightragPathFilter(logging.Filter): def __init__(self): super().__init__() # Define paths to be filtered - self.filtered_paths = ["/documents", "/health", "/webui/"] + # self.filtered_paths = ["/documents", "/health", "/webui/"] + self.filtered_paths = ["/health", "/webui/"] def filter(self, record): try: From 82c2cae324e1e9231a55ffaa81258e080b70b5a3 Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 13 Mar 2025 17:39:06 +0800 Subject: [PATCH 10/23] Unify two log filters into one and move it to utils --- lightrag/api/gunicorn_config.py | 2 +- lightrag/api/lightrag_server.py | 37 +-------------------------------- lightrag/utils.py | 3 ++- 3 files changed, 4 insertions(+), 38 deletions(-) diff --git a/lightrag/api/gunicorn_config.py b/lightrag/api/gunicorn_config.py index 0594ceae..23e46807 100644 --- a/lightrag/api/gunicorn_config.py +++ b/lightrag/api/gunicorn_config.py @@ -59,7 +59,7 @@ logconfig_dict = { }, "filters": { "path_filter": { - "()": "lightrag.api.lightrag_server.LightragPathFilter", + "()": "lightrag.utils.LightragPathFilter", }, }, "loggers": { diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index f4b21e66..24494705 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -55,41 +55,6 @@ config = configparser.ConfigParser() config.read("config.ini") -class LightragPathFilter(logging.Filter): - """Filter for lightrag logger to filter out frequent path access logs""" - - def __init__(self): - super().__init__() - # Define paths to be filtered - self.filtered_paths = ["/documents", "/health", "/webui/"] - - def filter(self, record): - try: - # Check if record has the required attributes for an access log - if not hasattr(record, "args") or not isinstance(record.args, tuple): - return True - if len(record.args) < 5: - return True - - # Extract method, path and status from the record args - method = record.args[1] - path = record.args[2] - status = record.args[4] - - # Filter out successful GET requests to filtered paths - if ( - method == "GET" - and (status == 200 or status == 304) - and path in self.filtered_paths - ): - return False - - return True - except Exception: - # In case of any error, let the message through - return True - - def create_app(args): # Setup logging logger.setLevel(args.log_level) @@ -519,7 +484,7 @@ def configure_logging(): }, "filters": { "path_filter": { - "()": "lightrag.api.lightrag_server.LightragPathFilter", + "()": "lightrag.utils.LightragPathFilter", }, }, } diff --git a/lightrag/utils.py b/lightrag/utils.py index b8f00c5d..9f751ec7 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py 
From 3400d3302a0cdb2e238208efc55a1eae3df001c7 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Thu, 13 Mar 2025 17:45:56 +0800
Subject: [PATCH 11/23] Added "/documents" to filtered paths

---
 lightrag/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 9f751ec7..362e5531 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -75,8 +75,8 @@ class LightragPathFilter(logging.Filter):
     def __init__(self):
         super().__init__()
         # Define paths to be filtered
-        # self.filtered_paths = ["/documents", "/health", "/webui/"]
-        self.filtered_paths = ["/health", "/webui/"]
+        self.filtered_paths = ["/documents", "/health", "/webui/"]
+        # self.filtered_paths = ["/health", "/webui/"]

     def filter(self, record):
         try:
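With "/documents" back in filtered_paths, successful GET requests to that route are again dropped from the access log. The sketch below feeds the filter hand-built records shaped like uvicorn access-log records; the (client, method, path, http_version, status) args layout is an assumption inferred from the indices the filter reads (args[1], args[2], args[4]).

    # Sketch: exercise LightragPathFilter with uvicorn-style access-log records.
    # The args tuple layout is assumed, not confirmed by these patches.
    import logging
    from lightrag.utils import LightragPathFilter

    path_filter = LightragPathFilter()

    def access_record(method: str, path: str, status: int) -> logging.LogRecord:
        return logging.LogRecord(
            name="uvicorn.access",
            level=logging.INFO,
            pathname=__file__,
            lineno=0,
            msg='%s - "%s %s HTTP/%s" %d',
            args=("127.0.0.1:52000", method, path, "1.1", status),
            exc_info=None,
        )

    print(path_filter.filter(access_record("GET", "/documents", 200)))   # False: suppressed
    print(path_filter.filter(access_record("GET", "/health", 304)))      # False: suppressed
    print(path_filter.filter(access_record("POST", "/documents", 200)))  # True: kept
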
From e30162e50a6c06e888ef66e40111367fda131a6c Mon Sep 17 00:00:00 2001
From: yangdx
Date: Thu, 13 Mar 2025 19:50:37 +0800
Subject: [PATCH 12/23] Minimized API request between Tab view change

---
 lightrag/utils.py                             |   4 +-
 lightrag_webui/src/App.tsx                    |   2 +-
 .../src/components/graph/GraphLabels.tsx      |  33 ++-
 .../src/components/ui/TabContent.tsx          |   8 +-
 .../src/contexts/TabVisibilityProvider.tsx    |  19 +-
 lightrag_webui/src/features/ApiSite.tsx       |  35 ++-
 .../src/features/DocumentManager.tsx          |  25 +-
 lightrag_webui/src/features/GraphViewer.tsx   |  35 +++
 lightrag_webui/src/hooks/useLightragGraph.tsx | 220 +++++++++++-------
 lightrag_webui/src/stores/graph.ts            |  28 ++-
 10 files changed, 304 insertions(+), 105 deletions(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 9f751ec7..362e5531 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -75,8 +75,8 @@ class LightragPathFilter(logging.Filter):
     def __init__(self):
         super().__init__()
         # Define paths to be filtered
-        # self.filtered_paths = ["/documents", "/health", "/webui/"]
-        self.filtered_paths = ["/health", "/webui/"]
+        self.filtered_paths = ["/documents", "/health", "/webui/"]
+        # self.filtered_paths = ["/health", "/webui/"]

     def filter(self, record):
         try:
diff --git a/lightrag_webui/src/App.tsx b/lightrag_webui/src/App.tsx
index bb4a84cb..b7d66b7e 100644
--- a/lightrag_webui/src/App.tsx
+++ b/lightrag_webui/src/App.tsx
@@ -22,7 +22,7 @@ import { Tabs, TabsContent } from '@/components/ui/Tabs'
 function App() {
   const message = useBackendState.use.message()
   const enableHealthCheck = useSettingsStore.use.enableHealthCheck()
-  const [currentTab] = useState(() => useSettingsStore.getState().currentTab)
+  const currentTab = useSettingsStore.use.currentTab()
   const [apiKeyInvalid, setApiKeyInvalid] = useState(false)

   // Health check
diff --git a/lightrag_webui/src/components/graph/GraphLabels.tsx b/lightrag_webui/src/components/graph/GraphLabels.tsx
index 1a1e428b..243c26cc 100644
--- a/lightrag_webui/src/components/graph/GraphLabels.tsx
+++ b/lightrag_webui/src/components/graph/GraphLabels.tsx
@@ -1,4 +1,4 @@
-import { useCallback } from 'react'
+import { useCallback, useEffect, useRef } from 'react'
 import { AsyncSelect } from '@/components/ui/AsyncSelect'
 import { useSettingsStore } from '@/stores/settings'
 import { useGraphStore } from '@/stores/graph'
@@ -10,6 +10,37 @@ const GraphLabels = () => {
   const { t } = useTranslation()
   const label = useSettingsStore.use.queryLabel()
   const allDatabaseLabels = useGraphStore.use.allDatabaseLabels()
+  const labelsLoadedRef = useRef(false)
+
+  // Track if a fetch is in progress to prevent multiple simultaneous fetches
+  const fetchInProgressRef = useRef(false)
+
+  // Fetch labels once on component mount, using global flag to prevent duplicates
+  useEffect(() => {
+    // Check if we've already attempted to fetch labels in this session
+    const labelsFetchAttempted = useGraphStore.getState().labelsFetchAttempted
+
+    // Only fetch if we haven't attempted in this session and no fetch is in progress
+    if (!labelsFetchAttempted && !fetchInProgressRef.current) {
+      fetchInProgressRef.current = true
+      // Set global flag to indicate we've attempted to fetch in this session
+      useGraphStore.getState().setLabelsFetchAttempted(true)
+
+      console.log('Fetching graph labels (once per session)...')
+
+      useGraphStore.getState().fetchAllDatabaseLabels()
+        .then(() => {
+          labelsLoadedRef.current = true
+          fetchInProgressRef.current = false
+        })
+        .catch((error) => {
+          console.error('Failed to fetch labels:', error)
+          fetchInProgressRef.current = false
+          // Reset global flag to allow retry
+          useGraphStore.getState().setLabelsFetchAttempted(false)
+        })
+    }
+  }, []) // Empty dependency array ensures this only runs once on mount

   const getSearchEngine = useCallback(() => {
     // Create search engine
diff --git a/lightrag_webui/src/components/ui/TabContent.tsx b/lightrag_webui/src/components/ui/TabContent.tsx
index f3c0b80f..2d14d849 100644
--- a/lightrag_webui/src/components/ui/TabContent.tsx
+++ b/lightrag_webui/src/components/ui/TabContent.tsx
@@ -25,12 +25,10 @@ const TabContent: React.FC = ({ tabId, children, className = ''
     };
   }, [tabId, setTabVisibility]);

-  if (!isVisible) {
-    return null;
-  }
-
+  // Use CSS to hide content instead of not rendering it
+  // This prevents components from unmounting when tabs are switched
   return (
-
+
{children}
); diff --git a/lightrag_webui/src/contexts/TabVisibilityProvider.tsx b/lightrag_webui/src/contexts/TabVisibilityProvider.tsx index 73be2f64..c4659906 100644 --- a/lightrag_webui/src/contexts/TabVisibilityProvider.tsx +++ b/lightrag_webui/src/contexts/TabVisibilityProvider.tsx @@ -1,6 +1,7 @@ -import React, { useState, useMemo } from 'react'; +import React, { useState, useEffect, useMemo } from 'react'; import { TabVisibilityContext } from './context'; import { TabVisibilityContextType } from './types'; +import { useSettingsStore } from '@/stores/settings'; interface TabVisibilityProviderProps { children: React.ReactNode; @@ -11,7 +12,21 @@ interface TabVisibilityProviderProps { * Manages the visibility state of tabs throughout the application */ export const TabVisibilityProvider: React.FC = ({ children }) => { - const [visibleTabs, setVisibleTabs] = useState>({}); + // Get current tab from settings store + const currentTab = useSettingsStore.use.currentTab(); + + // Initialize visibility state with current tab as visible + const [visibleTabs, setVisibleTabs] = useState>(() => ({ + [currentTab]: true + })); + + // Update visibility when current tab changes + useEffect(() => { + setVisibleTabs((prev) => ({ + ...prev, + [currentTab]: true + })); + }, [currentTab]); // Create the context value with memoization to prevent unnecessary re-renders const contextValue = useMemo( diff --git a/lightrag_webui/src/features/ApiSite.tsx b/lightrag_webui/src/features/ApiSite.tsx index fa9e263f..7adf9240 100644 --- a/lightrag_webui/src/features/ApiSite.tsx +++ b/lightrag_webui/src/features/ApiSite.tsx @@ -1,5 +1,38 @@ +import { useState, useEffect } from 'react' +import { useTabVisibility } from '@/contexts/useTabVisibility' import { backendBaseUrl } from '@/lib/constants' export default function ApiSite() { - return