From 990b684a8564acd7e3b841bc8cc1587fc9d251fc Mon Sep 17 00:00:00 2001 From: zrguo Date: Mon, 6 Jan 2025 15:27:31 +0800 Subject: [PATCH 01/21] Update lightrag.py --- lightrag/lightrag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 05de8d9f..cbe49da2 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -177,7 +177,7 @@ class LightRAG: enable_llm_cache: bool = True # Sometimes there are some reason the LLM failed at Extracting Entities, and we want to continue without LLM cost, we can use this flag - enable_llm_cache_for_entity_extract: bool = False + enable_llm_cache_for_entity_extract: bool = True # extension addon_params: dict = field(default_factory=dict) From c88bb4fd536273d00dd34d2c30ff938fc6e52340 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Mon, 6 Jan 2025 15:34:50 +0800 Subject: [PATCH 02/21] Update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index c65e8258..3152fbcd 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2025 Gustavo Ye +Copyright (c) 2025 LarFii Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From e2a4819af9f08f835c13632e783d19be0be34aac Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Mon, 6 Jan 2025 15:37:37 +0800 Subject: [PATCH 03/21] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ed2a7789..4a7e7a28 100644 --- a/README.md +++ b/README.md @@ -632,7 +632,7 @@ if __name__ == "__main__": | **llm\_model\_kwargs** | `dict` | Additional parameters for LLM generation | | | **vector\_db\_storage\_cls\_kwargs** | `dict` | Additional parameters for vector database (currently not used) | | | **enable\_llm\_cache** | `bool` | If `TRUE`, stores LLM results in cache; repeated prompts return cached responses | `TRUE` | -| **enable\_llm\_cache\_for\_entity\_extract** | `bool` | If `TRUE`, stores LLM results in cache for entity extraction; Good for beginners to debug your application | `FALSE` | +| **enable\_llm\_cache\_for\_entity\_extract** | `bool` | If `TRUE`, stores LLM results in cache for entity extraction; Good for beginners to debug your application | `TRUE` | | **addon\_params** | `dict` | Additional parameters, e.g., `{"example_number": 1, "language": "Simplified Chinese", "entity_types": ["organization", "person", "geo", "event"], "insert_batch_size": 10}`: sets example limit, output language, and batch size for document processing | `example_number: all examples, language: English, insert_batch_size: 10` | | **convert\_response\_to\_json\_func** | `callable` | Not used | `convert_response_to_json` | | **embedding\_cache\_config** | `dict` | Configuration for question-answer caching. Contains three parameters:
- `enabled`: Boolean value to enable/disable cache lookup functionality. When enabled, the system will check cached responses before generating new answers.
- `similarity_threshold`: Float value (0-1). When a new question's similarity to a cached question exceeds this threshold, the cached answer is returned directly without calling the LLM.<br>
- `use_llm_check`: Boolean value to enable/disable LLM similarity verification. When enabled, LLM will be used as a secondary check to verify the similarity between questions before returning cached answers. | Default: `{"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}` | From 916380e5111f783436cdec22849a07533168c653 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Mon, 6 Jan 2025 15:39:44 +0800 Subject: [PATCH 04/21] Update README.md --- README.md | 63 ------------------------------------------------------- 1 file changed, 63 deletions(-) diff --git a/README.md b/README.md index 4a7e7a28..0b474699 100644 --- a/README.md +++ b/README.md @@ -887,69 +887,6 @@ def extract_queries(file_path): ``` -## Code Structure - -```python -. -├── .github/ -│ ├── workflows/ -│ │ └── linting.yaml -├── examples/ -│ ├── batch_eval.py -│ ├── generate_query.py -│ ├── graph_visual_with_html.py -│ ├── graph_visual_with_neo4j.py -│ ├── insert_custom_kg.py -│ ├── lightrag_api_openai_compatible_demo.py -│ ├── lightrag_api_oracle_demo..py -│ ├── lightrag_azure_openai_demo.py -│ ├── lightrag_bedrock_demo.py -│ ├── lightrag_hf_demo.py -│ ├── lightrag_lmdeploy_demo.py -│ ├── lightrag_nvidia_demo.py -│ ├── lightrag_ollama_demo.py -│ ├── lightrag_openai_compatible_demo.py -│ ├── lightrag_openai_demo.py -│ ├── lightrag_oracle_demo.py -│ ├── lightrag_siliconcloud_demo.py -│ └── vram_management_demo.py -├── lightrag/ -│ ├── api/ -│ │ ├── lollms_lightrag_server.py -│ │ ├── ollama_lightrag_server.py -│ │ ├── openai_lightrag_server.py -│ │ ├── azure_openai_lightrag_server.py -│ │ └── requirements.txt -│ ├── kg/ -│ │ ├── __init__.py -│ │ ├── oracle_impl.py -│ │ └── neo4j_impl.py -│ ├── __init__.py -│ ├── base.py -│ ├── lightrag.py -│ ├── llm.py -│ ├── operate.py -│ ├── prompt.py -│ ├── storage.py -│ └── utils.py -├── reproduce/ -│ ├── Step_0.py -│ ├── Step_1_openai_compatible.py -│ ├── Step_1.py -│ ├── Step_2.py -│ ├── Step_3_openai_compatible.py -│ └── Step_3.py -├── .gitignore -├── .pre-commit-config.yaml -├── get_all_edges_nx.py -├── LICENSE -├── README.md -├── requirements.txt -├── setup.py -├── test_neo4j.py -└── test.py -``` - ## Install with API Support LightRAG provides optional API support through FastAPI servers that add RAG capabilities to existing LLM services. 
You can install LightRAG with API support in two ways: From 39a366a3dc3a02c3d42c3d3ca566bb828bbaf2d6 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Mon, 6 Jan 2025 15:43:05 +0800 Subject: [PATCH 05/21] Update __init__.py --- lightrag/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/__init__.py b/lightrag/__init__.py index cd2ccf04..b8037813 100644 --- a/lightrag/__init__.py +++ b/lightrag/__init__.py @@ -1,5 +1,5 @@ from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam -__version__ = "1.0.9" +__version__ = "1.1.0" __author__ = "Zirui Guo" __url__ = "https://github.com/HKUDS/LightRAG" From 79646fced8612187ec26e013eb34d19c210e4908 Mon Sep 17 00:00:00 2001 From: xYLiuuuuuu Date: Mon, 6 Jan 2025 16:54:53 +0800 Subject: [PATCH 06/21] Fix:Optimized logic for automatic switching modes when keywords do not exist --- lightrag/operate.py | 117 ++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 75 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index f21e41ff..c8e4565c 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -522,15 +522,16 @@ async def kg_query( logger.warning("low_level_keywords and high_level_keywords is empty") return PROMPTS["fail_response"] if ll_keywords == [] and query_param.mode in ["local", "hybrid"]: - logger.warning("low_level_keywords is empty") - return PROMPTS["fail_response"] - else: - ll_keywords = ", ".join(ll_keywords) + logger.warning("low_level_keywords is empty, switching from %s mode to global mode", query_param.mode) + query_param.mode = "global" if hl_keywords == [] and query_param.mode in ["global", "hybrid"]: - logger.warning("high_level_keywords is empty") - return PROMPTS["fail_response"] - else: - hl_keywords = ", ".join(hl_keywords) + logger.warning("high_level_keywords is empty, switching from %s mode to local mode", query_param.mode) + query_param.mode = "local" + + ll_keywords = ", ".join(ll_keywords) if ll_keywords else "" + hl_keywords = ", ".join(hl_keywords) if hl_keywords else "" + + logger.info("Using %s mode for query processing", query_param.mode) # Build context keywords = [ll_keywords, hl_keywords] @@ -596,78 +597,44 @@ async def _build_query_context( # ll_entities_context, ll_relations_context, ll_text_units_context = "", "", "" # hl_entities_context, hl_relations_context, hl_text_units_context = "", "", "" - ll_kewwords, hl_keywrds = query[0], query[1] - if query_param.mode in ["local", "hybrid"]: - if ll_kewwords == "": - ll_entities_context, ll_relations_context, ll_text_units_context = ( - "", - "", - "", - ) - warnings.warn( - "Low Level context is None. Return empty Low entity/relationship/source" - ) - query_param.mode = "global" - else: - ( - ll_entities_context, - ll_relations_context, - ll_text_units_context, - ) = await _get_node_data( - ll_kewwords, - knowledge_graph_inst, - entities_vdb, - text_chunks_db, - query_param, - ) - if query_param.mode in ["global", "hybrid"]: - if hl_keywrds == "": - hl_entities_context, hl_relations_context, hl_text_units_context = ( - "", - "", - "", - ) - warnings.warn( - "High Level context is None. 
Return empty High entity/relationship/source" - ) - query_param.mode = "local" - else: - ( - hl_entities_context, - hl_relations_context, - hl_text_units_context, - ) = await _get_edge_data( - hl_keywrds, - knowledge_graph_inst, - relationships_vdb, - text_chunks_db, - query_param, - ) - if ( - hl_entities_context == "" - and hl_relations_context == "" - and hl_text_units_context == "" - ): - logger.warn("No high level context found. Switching to local mode.") - query_param.mode = "local" - if query_param.mode == "hybrid": + ll_keywords, hl_keywords = query[0], query[1] + + if query_param.mode == "local": + entities_context, relations_context, text_units_context = await _get_node_data( + ll_keywords, + knowledge_graph_inst, + entities_vdb, + text_chunks_db, + query_param, + ) + elif query_param.mode == "global": + entities_context, relations_context, text_units_context = await _get_edge_data( + hl_keywords, + knowledge_graph_inst, + relationships_vdb, + text_chunks_db, + query_param, + ) + else: # hybrid mode + ll_entities_context, ll_relations_context, ll_text_units_context = await _get_node_data( + ll_keywords, + knowledge_graph_inst, + entities_vdb, + text_chunks_db, + query_param, + ) + hl_entities_context, hl_relations_context, hl_text_units_context = await _get_edge_data( + hl_keywords, + knowledge_graph_inst, + relationships_vdb, + text_chunks_db, + query_param, + ) entities_context, relations_context, text_units_context = combine_contexts( [hl_entities_context, ll_entities_context], [hl_relations_context, ll_relations_context], [hl_text_units_context, ll_text_units_context], ) - elif query_param.mode == "local": - entities_context, relations_context, text_units_context = ( - ll_entities_context, - ll_relations_context, - ll_text_units_context, - ) - elif query_param.mode == "global": - entities_context, relations_context, text_units_context = ( - hl_entities_context, - hl_relations_context, - hl_text_units_context, - ) return f""" -----Entities----- ```csv From e415f88bd41f09bf93b32283e1ede067d59b659d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=9C=A8Data=20Intelligence=20Lab=40HKU=E2=9C=A8?= <118165258+HKUDS@users.noreply.github.com> Date: Mon, 6 Jan 2025 23:20:26 +0800 Subject: [PATCH 07/21] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0b474699..24f569d8 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ This repository hosts the code of LightRAG. The structure of this code is based ## 🎉 News -- [x] [2025.01.06]🎯📢You can now [use PostgreSQL for Storage](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-postgres-for-storage). +- [x] [2025.01.06]🎯📢LightRAG Now Supports [PostgreSQL for Storage](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-postgres-for-storage). - [x] [2024.12.31]🎯📢LightRAG now supports [deletion by document ID](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete). - [x] [2024.11.25]🎯📢LightRAG now supports seamless integration of [custom knowledge graphs](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg), empowering users to enhance the system with their own domain expertise. - [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author. 
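The `kg_query` change in PATCH 06 above replaces the hard failure on empty keywords with a mode downgrade: a `local`/`hybrid` query whose low-level keywords come back empty is retried in `global` mode, a `global`/`hybrid` query without high-level keywords falls back to `local`, and only a query with both keyword lists empty still returns the fail response. A minimal caller-side sketch of the resulting behaviour follows; the working directory, the toy document, and reliance on the default `gpt_4o_mini_complete` model function are illustrative assumptions, not part of the patches.

```python
import os
from lightrag import LightRAG, QueryParam

WORKING_DIR = "./lightrag_mode_demo"  # hypothetical scratch directory
os.makedirs(WORKING_DIR, exist_ok=True)

# llm_model_func defaults to the OpenAI-backed gpt_4o_mini_complete,
# so this assumes OPENAI_API_KEY is set in the environment.
rag = LightRAG(working_dir=WORKING_DIR)

rag.insert(
    "LightRAG builds a knowledge graph from inserted documents and "
    "answers questions over it in naive, local, global, or hybrid mode."
)

question = "How are the entities mentioned in the inserted text related?"

# With PATCH 06 applied, an empty low-level or high-level keyword list logs a
# warning and switches query_param.mode instead of returning the fail response,
# so each of these calls still produces an answer whenever any context exists.
for mode in ("local", "global", "hybrid"):
    print(f"--- {mode} ---")
    print(rag.query(question, param=QueryParam(mode=mode)))
```

Note that the downgrade mutates `query_param.mode` in place, so the `Using %s mode for query processing` log line reports the mode actually used to build the context, not the mode originally requested.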
From 22e9f1cd8919b18c528dff9977eee7c6bd9f4fe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=9C=A8Data=20Intelligence=20Lab=40HKU=E2=9C=A8?= <118165258+HKUDS@users.noreply.github.com> Date: Mon, 6 Jan 2025 23:21:02 +0800 Subject: [PATCH 08/21] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 24f569d8..f66fb3ce 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ This repository hosts the code of LightRAG. The structure of this code is based ## 🎉 News -- [x] [2025.01.06]🎯📢LightRAG Now Supports [PostgreSQL for Storage](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-postgres-for-storage). +- [x] [2025.01.06]🎯📢LightRAG now supports [PostgreSQL for Storage](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-postgres-for-storage). - [x] [2024.12.31]🎯📢LightRAG now supports [deletion by document ID](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete). - [x] [2024.11.25]🎯📢LightRAG now supports seamless integration of [custom knowledge graphs](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg), empowering users to enhance the system with their own domain expertise. - [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author. From 536d6f2283815fedb2c423010504fb12fc440055 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=AB=A5=E7=9F=B3=E6=B8=8A?= Date: Tue, 7 Jan 2025 00:28:15 +0800 Subject: [PATCH 09/21] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=AD=97=E7=AC=A6?= =?UTF-8?q?=E5=88=86=E5=89=B2=E5=8A=9F=E8=83=BD=EF=BC=8C=E5=9C=A8=E2=80=9C?= =?UTF-8?q?insert=E2=80=9D=E5=87=BD=E6=95=B0=E4=B8=AD=E5=A6=82=E6=9E=9C?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=8F=82=E6=95=B0split=5Fby=5Fcharacter?= =?UTF-8?q?=EF=BC=8C=E5=88=99=E4=BC=9A=E6=8C=89=E7=85=A7split=5Fby=5Fchara?= =?UTF-8?q?cter=E8=BF=9B=E8=A1=8C=E5=AD=97=E7=AC=A6=E5=88=86=E5=89=B2?= =?UTF-8?q?=EF=BC=8C=E6=AD=A4=E6=97=B6=E5=A6=82=E6=9E=9C=E6=AF=8F=E4=B8=AA?= =?UTF-8?q?=E5=88=86=E5=89=B2=E5=90=8E=E7=9A=84chunk=E7=9A=84tokens?= =?UTF-8?q?=E5=A4=A7=E4=BA=8Emax=5Ftoken=5Fsize=EF=BC=8C=E5=88=99=E4=BC=9A?= =?UTF-8?q?=E7=BB=A7=E7=BB=AD=E6=8C=89token=5Fsize=E5=88=86=E5=89=B2?= =?UTF-8?q?=EF=BC=88todo=EF=BC=9A=E8=80=83=E8=99=91=E5=AD=97=E7=AC=A6?= =?UTF-8?q?=E5=88=86=E5=89=B2=E5=90=8E=E8=BF=87=E7=9F=AD=E7=9A=84chunk?= =?UTF-8?q?=E5=A4=84=E7=90=86=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lightrag/lightrag.py | 41 ++++--- lightrag/operate.py | 276 +++++++++++++++++++++++-------------------- 2 files changed, 171 insertions(+), 146 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index cbe49da2..47d64ac0 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -45,6 +45,7 @@ from .storage import ( from .prompt import GRAPH_FIELD_SEP + # future KG integrations # from .kg.ArangoDB_impl import ( @@ -167,7 +168,7 @@ class LightRAG: # LLM llm_model_func: callable = gpt_4o_mini_complete # hf_model_complete# - llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct" #'meta-llama/Llama-3.2-1B'#'google/gemma-2-2b-it' + llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct" # 'meta-llama/Llama-3.2-1B'#'google/gemma-2-2b-it' llm_model_max_token_size: int = 32768 llm_model_max_async: int = 16 llm_model_kwargs: dict = field(default_factory=dict) @@ -267,7 +268,7 @@ class LightRAG: self.llm_model_func, hashing_kv=self.llm_response_cache if self.llm_response_cache - and 
hasattr(self.llm_response_cache, "global_config") + and hasattr(self.llm_response_cache, "global_config") else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), @@ -313,15 +314,16 @@ class LightRAG: "JsonDocStatusStorage": JsonDocStatusStorage, } - def insert(self, string_or_strings): + def insert(self, string_or_strings, split_by_character=None): loop = always_get_an_event_loop() - return loop.run_until_complete(self.ainsert(string_or_strings)) + return loop.run_until_complete(self.ainsert(string_or_strings, split_by_character)) - async def ainsert(self, string_or_strings): + async def ainsert(self, string_or_strings, split_by_character): """Insert documents with checkpoint support Args: string_or_strings: Single document string or list of document strings + split_by_character: if split_by_character is not None, split the string by character """ if isinstance(string_or_strings, str): string_or_strings = [string_or_strings] @@ -355,10 +357,10 @@ class LightRAG: # Process documents in batches batch_size = self.addon_params.get("insert_batch_size", 10) for i in range(0, len(new_docs), batch_size): - batch_docs = dict(list(new_docs.items())[i : i + batch_size]) + batch_docs = dict(list(new_docs.items())[i: i + batch_size]) for doc_id, doc in tqdm_async( - batch_docs.items(), desc=f"Processing batch {i//batch_size + 1}" + batch_docs.items(), desc=f"Processing batch {i // batch_size + 1}" ): try: # Update status to processing @@ -379,6 +381,7 @@ class LightRAG: } for dp in chunking_by_token_size( doc["content"], + split_by_character=split_by_character, overlap_token_size=self.chunk_overlap_token_size, max_token_size=self.chunk_token_size, tiktoken_model=self.tiktoken_model_name, @@ -545,7 +548,7 @@ class LightRAG: # Check if nodes exist in the knowledge graph for need_insert_id in [src_id, tgt_id]: if not ( - await self.chunk_entity_relation_graph.has_node(need_insert_id) + await self.chunk_entity_relation_graph.has_node(need_insert_id) ): await self.chunk_entity_relation_graph.upsert_node( need_insert_id, @@ -594,9 +597,9 @@ class LightRAG: "src_id": dp["src_id"], "tgt_id": dp["tgt_id"], "content": dp["keywords"] - + dp["src_id"] - + dp["tgt_id"] - + dp["description"], + + dp["src_id"] + + dp["tgt_id"] + + dp["description"], } for dp in all_relationships_data } @@ -621,7 +624,7 @@ class LightRAG: asdict(self), hashing_kv=self.llm_response_cache if self.llm_response_cache - and hasattr(self.llm_response_cache, "global_config") + and hasattr(self.llm_response_cache, "global_config") else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), @@ -637,7 +640,7 @@ class LightRAG: asdict(self), hashing_kv=self.llm_response_cache if self.llm_response_cache - and hasattr(self.llm_response_cache, "global_config") + and hasattr(self.llm_response_cache, "global_config") else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), @@ -656,7 +659,7 @@ class LightRAG: asdict(self), hashing_kv=self.llm_response_cache if self.llm_response_cache - and hasattr(self.llm_response_cache, "global_config") + and hasattr(self.llm_response_cache, "global_config") else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), @@ -897,7 +900,7 @@ class LightRAG: dp for dp in self.entities_vdb.client_storage["data"] if chunk_id - in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) + in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) ] if 
entities_with_chunk: logger.error( @@ -909,7 +912,7 @@ class LightRAG: dp for dp in self.relationships_vdb.client_storage["data"] if chunk_id - in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) + in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) ] if relations_with_chunk: logger.error( @@ -926,7 +929,7 @@ class LightRAG: return asyncio.run(self.adelete_by_doc_id(doc_id)) async def get_entity_info( - self, entity_name: str, include_vector_data: bool = False + self, entity_name: str, include_vector_data: bool = False ): """Get detailed information of an entity @@ -977,7 +980,7 @@ class LightRAG: tracemalloc.stop() async def get_relation_info( - self, src_entity: str, tgt_entity: str, include_vector_data: bool = False + self, src_entity: str, tgt_entity: str, include_vector_data: bool = False ): """Get detailed information of a relationship @@ -1019,7 +1022,7 @@ class LightRAG: return result def get_relation_info_sync( - self, src_entity: str, tgt_entity: str, include_vector_data: bool = False + self, src_entity: str, tgt_entity: str, include_vector_data: bool = False ): """Synchronous version of getting relationship information diff --git a/lightrag/operate.py b/lightrag/operate.py index b2c4d215..e8f0df65 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -34,30 +34,52 @@ import time def chunking_by_token_size( - content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o" + content: str, split_by_character=None, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o" ): tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model) results = [] - for index, start in enumerate( - range(0, len(tokens), max_token_size - overlap_token_size) - ): - chunk_content = decode_tokens_by_tiktoken( - tokens[start : start + max_token_size], model_name=tiktoken_model - ) - results.append( - { - "tokens": min(max_token_size, len(tokens) - start), - "content": chunk_content.strip(), - "chunk_order_index": index, - } - ) + if split_by_character: + raw_chunks = content.split(split_by_character) + new_chunks = [] + for chunk in raw_chunks: + _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model) + if len(_tokens) > max_token_size: + for start in range(0, len(_tokens), max_token_size - overlap_token_size): + chunk_content = decode_tokens_by_tiktoken( + _tokens[start: start + max_token_size], model_name=tiktoken_model + ) + new_chunks.append((min(max_token_size, len(_tokens) - start), chunk_content)) + else: + new_chunks.append((len(_tokens), chunk)) + for index, (_len, chunk) in enumerate(new_chunks): + results.append( + { + "tokens": _len, + "content": chunk.strip(), + "chunk_order_index": index, + } + ) + else: + for index, start in enumerate( + range(0, len(tokens), max_token_size - overlap_token_size) + ): + chunk_content = decode_tokens_by_tiktoken( + tokens[start: start + max_token_size], model_name=tiktoken_model + ) + results.append( + { + "tokens": min(max_token_size, len(tokens) - start), + "content": chunk_content.strip(), + "chunk_order_index": index, + } + ) return results async def _handle_entity_relation_summary( - entity_or_relation_name: str, - description: str, - global_config: dict, + entity_or_relation_name: str, + description: str, + global_config: dict, ) -> str: use_llm_func: callable = global_config["llm_model_func"] llm_max_tokens = global_config["llm_model_max_token_size"] @@ -86,8 +108,8 @@ async def _handle_entity_relation_summary( async def _handle_single_entity_extraction( - record_attributes: list[str], 
- chunk_key: str, + record_attributes: list[str], + chunk_key: str, ): if len(record_attributes) < 4 or record_attributes[0] != '"entity"': return None @@ -107,8 +129,8 @@ async def _handle_single_entity_extraction( async def _handle_single_relationship_extraction( - record_attributes: list[str], - chunk_key: str, + record_attributes: list[str], + chunk_key: str, ): if len(record_attributes) < 5 or record_attributes[0] != '"relationship"': return None @@ -134,10 +156,10 @@ async def _handle_single_relationship_extraction( async def _merge_nodes_then_upsert( - entity_name: str, - nodes_data: list[dict], - knowledge_graph_inst: BaseGraphStorage, - global_config: dict, + entity_name: str, + nodes_data: list[dict], + knowledge_graph_inst: BaseGraphStorage, + global_config: dict, ): already_entity_types = [] already_source_ids = [] @@ -181,11 +203,11 @@ async def _merge_nodes_then_upsert( async def _merge_edges_then_upsert( - src_id: str, - tgt_id: str, - edges_data: list[dict], - knowledge_graph_inst: BaseGraphStorage, - global_config: dict, + src_id: str, + tgt_id: str, + edges_data: list[dict], + knowledge_graph_inst: BaseGraphStorage, + global_config: dict, ): already_weights = [] already_source_ids = [] @@ -248,12 +270,12 @@ async def _merge_edges_then_upsert( async def extract_entities( - chunks: dict[str, TextChunkSchema], - knowledge_graph_inst: BaseGraphStorage, - entity_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - global_config: dict, - llm_response_cache: BaseKVStorage = None, + chunks: dict[str, TextChunkSchema], + knowledge_graph_inst: BaseGraphStorage, + entity_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + global_config: dict, + llm_response_cache: BaseKVStorage = None, ) -> Union[BaseGraphStorage, None]: use_llm_func: callable = global_config["llm_model_func"] entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"] @@ -305,13 +327,13 @@ async def extract_entities( already_relations = 0 async def _user_llm_func_with_cache( - input_text: str, history_messages: list[dict[str, str]] = None + input_text: str, history_messages: list[dict[str, str]] = None ) -> str: if enable_llm_cache_for_entity_extract and llm_response_cache: need_to_restore = False if ( - global_config["embedding_cache_config"] - and global_config["embedding_cache_config"]["enabled"] + global_config["embedding_cache_config"] + and global_config["embedding_cache_config"]["enabled"] ): new_config = global_config.copy() new_config["embedding_cache_config"] = None @@ -413,7 +435,7 @@ async def extract_entities( already_relations += len(maybe_edges) now_ticks = PROMPTS["process_tickers"][ already_processed % len(PROMPTS["process_tickers"]) - ] + ] print( f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r", end="", @@ -423,10 +445,10 @@ async def extract_entities( results = [] for result in tqdm_async( - asyncio.as_completed([_process_single_content(c) for c in ordered_chunks]), - total=len(ordered_chunks), - desc="Extracting entities from chunks", - unit="chunk", + asyncio.as_completed([_process_single_content(c) for c in ordered_chunks]), + total=len(ordered_chunks), + desc="Extracting entities from chunks", + unit="chunk", ): results.append(await result) @@ -440,32 +462,32 @@ async def extract_entities( logger.info("Inserting entities into storage...") all_entities_data = [] for result in tqdm_async( - asyncio.as_completed( - [ - _merge_nodes_then_upsert(k, v, 
knowledge_graph_inst, global_config) - for k, v in maybe_nodes.items() - ] - ), - total=len(maybe_nodes), - desc="Inserting entities", - unit="entity", + asyncio.as_completed( + [ + _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config) + for k, v in maybe_nodes.items() + ] + ), + total=len(maybe_nodes), + desc="Inserting entities", + unit="entity", ): all_entities_data.append(await result) logger.info("Inserting relationships into storage...") all_relationships_data = [] for result in tqdm_async( - asyncio.as_completed( - [ - _merge_edges_then_upsert( - k[0], k[1], v, knowledge_graph_inst, global_config - ) - for k, v in maybe_edges.items() - ] - ), - total=len(maybe_edges), - desc="Inserting relationships", - unit="relationship", + asyncio.as_completed( + [ + _merge_edges_then_upsert( + k[0], k[1], v, knowledge_graph_inst, global_config + ) + for k, v in maybe_edges.items() + ] + ), + total=len(maybe_edges), + desc="Inserting relationships", + unit="relationship", ): all_relationships_data.append(await result) @@ -496,9 +518,9 @@ async def extract_entities( "src_id": dp["src_id"], "tgt_id": dp["tgt_id"], "content": dp["keywords"] - + dp["src_id"] - + dp["tgt_id"] - + dp["description"], + + dp["src_id"] + + dp["tgt_id"] + + dp["description"], "metadata": { "created_at": dp.get("metadata", {}).get("created_at", time.time()) }, @@ -511,14 +533,14 @@ async def extract_entities( async def kg_query( - query, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, - global_config: dict, - hashing_kv: BaseKVStorage = None, + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + global_config: dict, + hashing_kv: BaseKVStorage = None, ) -> str: # Handle cache use_model_func = global_config["llm_model_func"] @@ -638,12 +660,12 @@ async def kg_query( async def _build_query_context( - query: list, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, + query: list, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, ): # ll_entities_context, ll_relations_context, ll_text_units_context = "", "", "" # hl_entities_context, hl_relations_context, hl_text_units_context = "", "", "" @@ -696,9 +718,9 @@ async def _build_query_context( query_param, ) if ( - hl_entities_context == "" - and hl_relations_context == "" - and hl_text_units_context == "" + hl_entities_context == "" + and hl_relations_context == "" + and hl_text_units_context == "" ): logger.warn("No high level context found. 
Switching to local mode.") query_param.mode = "local" @@ -737,11 +759,11 @@ async def _build_query_context( async def _get_node_data( - query, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, ): # get similar entities results = await entities_vdb.query(query, top_k=query_param.top_k) @@ -828,10 +850,10 @@ async def _get_node_data( async def _find_most_related_text_unit_from_entities( - node_datas: list[dict], - query_param: QueryParam, - text_chunks_db: BaseKVStorage[TextChunkSchema], - knowledge_graph_inst: BaseGraphStorage, + node_datas: list[dict], + query_param: QueryParam, + text_chunks_db: BaseKVStorage[TextChunkSchema], + knowledge_graph_inst: BaseGraphStorage, ): text_units = [ split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP]) @@ -871,8 +893,8 @@ async def _find_most_related_text_unit_from_entities( if this_edges: for e in this_edges: if ( - e[1] in all_one_hop_text_units_lookup - and c_id in all_one_hop_text_units_lookup[e[1]] + e[1] in all_one_hop_text_units_lookup + and c_id in all_one_hop_text_units_lookup[e[1]] ): all_text_units_lookup[c_id]["relation_counts"] += 1 @@ -902,9 +924,9 @@ async def _find_most_related_text_unit_from_entities( async def _find_most_related_edges_from_entities( - node_datas: list[dict], - query_param: QueryParam, - knowledge_graph_inst: BaseGraphStorage, + node_datas: list[dict], + query_param: QueryParam, + knowledge_graph_inst: BaseGraphStorage, ): all_related_edges = await asyncio.gather( *[knowledge_graph_inst.get_node_edges(dp["entity_name"]) for dp in node_datas] @@ -942,11 +964,11 @@ async def _find_most_related_edges_from_entities( async def _get_edge_data( - keywords, - knowledge_graph_inst: BaseGraphStorage, - relationships_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, + keywords, + knowledge_graph_inst: BaseGraphStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, ): results = await relationships_vdb.query(keywords, top_k=query_param.top_k) @@ -1044,9 +1066,9 @@ async def _get_edge_data( async def _find_most_related_entities_from_relationships( - edge_datas: list[dict], - query_param: QueryParam, - knowledge_graph_inst: BaseGraphStorage, + edge_datas: list[dict], + query_param: QueryParam, + knowledge_graph_inst: BaseGraphStorage, ): entity_names = [] seen = set() @@ -1081,10 +1103,10 @@ async def _find_most_related_entities_from_relationships( async def _find_related_text_unit_from_relationships( - edge_datas: list[dict], - query_param: QueryParam, - text_chunks_db: BaseKVStorage[TextChunkSchema], - knowledge_graph_inst: BaseGraphStorage, + edge_datas: list[dict], + query_param: QueryParam, + text_chunks_db: BaseKVStorage[TextChunkSchema], + knowledge_graph_inst: BaseGraphStorage, ): text_units = [ split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP]) @@ -1150,12 +1172,12 @@ def combine_contexts(entities, relationships, sources): async def naive_query( - query, - chunks_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, - global_config: dict, - hashing_kv: BaseKVStorage = None, + query, + chunks_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + 
query_param: QueryParam, + global_config: dict, + hashing_kv: BaseKVStorage = None, ): # Handle cache use_model_func = global_config["llm_model_func"] @@ -1213,7 +1235,7 @@ async def naive_query( if len(response) > len(sys_prompt): response = ( - response[len(sys_prompt) :] + response[len(sys_prompt):] .replace(sys_prompt, "") .replace("user", "") .replace("model", "") @@ -1241,15 +1263,15 @@ async def naive_query( async def mix_kg_vector_query( - query, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - chunks_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, - global_config: dict, - hashing_kv: BaseKVStorage = None, + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + chunks_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + global_config: dict, + hashing_kv: BaseKVStorage = None, ) -> str: """ Hybrid retrieval implementation combining knowledge graph and vector search. @@ -1274,7 +1296,7 @@ async def mix_kg_vector_query( # Reuse keyword extraction logic from kg_query example_number = global_config["addon_params"].get("example_number", None) if example_number and example_number < len( - PROMPTS["keywords_extraction_examples"] + PROMPTS["keywords_extraction_examples"] ): examples = "\n".join( PROMPTS["keywords_extraction_examples"][: int(example_number)] From 3bbd3ee1b232cf1335617a5f4308651b295061b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=AB=A5=E7=9F=B3=E6=B8=8A?= Date: Tue, 7 Jan 2025 13:45:18 +0800 Subject: [PATCH 10/21] =?UTF-8?q?=E5=9C=A8Mac=E7=AB=AFtorch~=3D2.5.1+cu121?= =?UTF-8?q?=E4=BC=9A=E5=AF=BC=E8=87=B4=E6=9C=AC=E5=9C=B0=E5=AE=89=E8=A3=85?= =?UTF-8?q?=E6=97=B6=E6=8A=A5=E9=94=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 79249e7e..dd3c4cf3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,7 +32,8 @@ tenacity~=9.0.0 # LLM packages tiktoken~=0.8.0 -torch~=2.5.1+cu121 +# torch~=2.5.1+cu121 +torch~=2.5.1 tqdm~=4.67.1 transformers~=4.47.1 xxhash From 290744d77040799c2c238524ad39cb1355c1182f Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 7 Jan 2025 16:04:46 +0800 Subject: [PATCH 11/21] fix requirements.txt --- requirements.txt | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/requirements.txt b/requirements.txt index 79249e7e..e81473ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,38 +1,38 @@ accelerate -aioboto3~=13.3.0 -aiofiles~=24.1.0 -aiohttp~=3.11.11 -asyncpg~=0.30.0 +aioboto3 +aiofiles +aiohttp +asyncpg # database packages graspologic gremlinpython hnswlib nano-vectordb -neo4j~=5.27.0 -networkx~=3.2.1 +neo4j +networkx -numpy~=2.2.0 -ollama~=0.4.4 -openai~=1.58.1 +numpy +ollama +openai oracledb -psycopg-pool~=3.2.4 -psycopg[binary,pool]~=3.2.3 -pydantic~=2.10.4 +psycopg-pool +psycopg[binary,pool] +pydantic pymilvus pymongo pymysql -python-dotenv~=1.0.1 -pyvis~=0.3.2 -setuptools~=70.0.0 +python-dotenv +pyvis +setuptools # lmdeploy[all] -sqlalchemy~=2.0.36 -tenacity~=9.0.0 +sqlalchemy +tenacity # LLM packages -tiktoken~=0.8.0 -torch~=2.5.1+cu121 -tqdm~=4.67.1 -transformers~=4.47.1 -xxhash +tiktoken +torch +tqdm +transformers +xxhash \ No newline at end 
of file From 9ef4fe667aeb0ac4b303de698fcdef3ae4fb1c20 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 7 Jan 2025 16:18:19 +0800 Subject: [PATCH 12/21] rename --- contributor-readme.MD => contributor-README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename contributor-readme.MD => contributor-README.md (100%) diff --git a/contributor-readme.MD b/contributor-README.md similarity index 100% rename from contributor-readme.MD rename to contributor-README.md From 79d705071027e15a57c54cc64bc07d2dda246498 Mon Sep 17 00:00:00 2001 From: LarFii <834462287@qq.com> Date: Tue, 7 Jan 2025 16:21:54 +0800 Subject: [PATCH 13/21] fix linting errors --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e81473ea..48c25ff8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,4 +35,4 @@ tiktoken torch tqdm transformers -xxhash \ No newline at end of file +xxhash From 6b19401dc6f0a27597f15990bd86206409feb540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=AB=A5=E7=9F=B3=E6=B8=8A?= Date: Tue, 7 Jan 2025 16:26:12 +0800 Subject: [PATCH 14/21] chunk split retry --- lightrag/lightrag.py | 34 +- lightrag/operate.py | 247 ++++++++------- test.ipynb | 740 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 886 insertions(+), 135 deletions(-) create mode 100644 test.ipynb diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 47d64ac0..7496d736 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -268,7 +268,7 @@ class LightRAG: self.llm_model_func, hashing_kv=self.llm_response_cache if self.llm_response_cache - and hasattr(self.llm_response_cache, "global_config") + and hasattr(self.llm_response_cache, "global_config") else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), @@ -316,7 +316,9 @@ class LightRAG: def insert(self, string_or_strings, split_by_character=None): loop = always_get_an_event_loop() - return loop.run_until_complete(self.ainsert(string_or_strings, split_by_character)) + return loop.run_until_complete( + self.ainsert(string_or_strings, split_by_character) + ) async def ainsert(self, string_or_strings, split_by_character): """Insert documents with checkpoint support @@ -357,10 +359,10 @@ class LightRAG: # Process documents in batches batch_size = self.addon_params.get("insert_batch_size", 10) for i in range(0, len(new_docs), batch_size): - batch_docs = dict(list(new_docs.items())[i: i + batch_size]) + batch_docs = dict(list(new_docs.items())[i : i + batch_size]) for doc_id, doc in tqdm_async( - batch_docs.items(), desc=f"Processing batch {i // batch_size + 1}" + batch_docs.items(), desc=f"Processing batch {i // batch_size + 1}" ): try: # Update status to processing @@ -548,7 +550,7 @@ class LightRAG: # Check if nodes exist in the knowledge graph for need_insert_id in [src_id, tgt_id]: if not ( - await self.chunk_entity_relation_graph.has_node(need_insert_id) + await self.chunk_entity_relation_graph.has_node(need_insert_id) ): await self.chunk_entity_relation_graph.upsert_node( need_insert_id, @@ -597,9 +599,9 @@ class LightRAG: "src_id": dp["src_id"], "tgt_id": dp["tgt_id"], "content": dp["keywords"] - + dp["src_id"] - + dp["tgt_id"] - + dp["description"], + + dp["src_id"] + + dp["tgt_id"] + + dp["description"], } for dp in all_relationships_data } @@ -624,7 +626,7 @@ class LightRAG: asdict(self), hashing_kv=self.llm_response_cache if self.llm_response_cache - and hasattr(self.llm_response_cache, "global_config") + 
and hasattr(self.llm_response_cache, "global_config") else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), @@ -640,7 +642,7 @@ class LightRAG: asdict(self), hashing_kv=self.llm_response_cache if self.llm_response_cache - and hasattr(self.llm_response_cache, "global_config") + and hasattr(self.llm_response_cache, "global_config") else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), @@ -659,7 +661,7 @@ class LightRAG: asdict(self), hashing_kv=self.llm_response_cache if self.llm_response_cache - and hasattr(self.llm_response_cache, "global_config") + and hasattr(self.llm_response_cache, "global_config") else self.key_string_value_json_storage_cls( namespace="llm_response_cache", global_config=asdict(self), @@ -900,7 +902,7 @@ class LightRAG: dp for dp in self.entities_vdb.client_storage["data"] if chunk_id - in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) + in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) ] if entities_with_chunk: logger.error( @@ -912,7 +914,7 @@ class LightRAG: dp for dp in self.relationships_vdb.client_storage["data"] if chunk_id - in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) + in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP) ] if relations_with_chunk: logger.error( @@ -929,7 +931,7 @@ class LightRAG: return asyncio.run(self.adelete_by_doc_id(doc_id)) async def get_entity_info( - self, entity_name: str, include_vector_data: bool = False + self, entity_name: str, include_vector_data: bool = False ): """Get detailed information of an entity @@ -980,7 +982,7 @@ class LightRAG: tracemalloc.stop() async def get_relation_info( - self, src_entity: str, tgt_entity: str, include_vector_data: bool = False + self, src_entity: str, tgt_entity: str, include_vector_data: bool = False ): """Get detailed information of a relationship @@ -1022,7 +1024,7 @@ class LightRAG: return result def get_relation_info_sync( - self, src_entity: str, tgt_entity: str, include_vector_data: bool = False + self, src_entity: str, tgt_entity: str, include_vector_data: bool = False ): """Synchronous version of getting relationship information diff --git a/lightrag/operate.py b/lightrag/operate.py index e8f0df65..1128b41c 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -34,7 +34,11 @@ import time def chunking_by_token_size( - content: str, split_by_character=None, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o" + content: str, + split_by_character=None, + overlap_token_size=128, + max_token_size=1024, + tiktoken_model="gpt-4o", ): tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model) results = [] @@ -44,11 +48,16 @@ def chunking_by_token_size( for chunk in raw_chunks: _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model) if len(_tokens) > max_token_size: - for start in range(0, len(_tokens), max_token_size - overlap_token_size): + for start in range( + 0, len(_tokens), max_token_size - overlap_token_size + ): chunk_content = decode_tokens_by_tiktoken( - _tokens[start: start + max_token_size], model_name=tiktoken_model + _tokens[start : start + max_token_size], + model_name=tiktoken_model, + ) + new_chunks.append( + (min(max_token_size, len(_tokens) - start), chunk_content) ) - new_chunks.append((min(max_token_size, len(_tokens) - start), chunk_content)) else: new_chunks.append((len(_tokens), chunk)) for index, (_len, chunk) in enumerate(new_chunks): @@ -61,10 +70,10 @@ def chunking_by_token_size( ) else: for index, 
start in enumerate( - range(0, len(tokens), max_token_size - overlap_token_size) + range(0, len(tokens), max_token_size - overlap_token_size) ): chunk_content = decode_tokens_by_tiktoken( - tokens[start: start + max_token_size], model_name=tiktoken_model + tokens[start : start + max_token_size], model_name=tiktoken_model ) results.append( { @@ -77,9 +86,9 @@ def chunking_by_token_size( async def _handle_entity_relation_summary( - entity_or_relation_name: str, - description: str, - global_config: dict, + entity_or_relation_name: str, + description: str, + global_config: dict, ) -> str: use_llm_func: callable = global_config["llm_model_func"] llm_max_tokens = global_config["llm_model_max_token_size"] @@ -108,8 +117,8 @@ async def _handle_entity_relation_summary( async def _handle_single_entity_extraction( - record_attributes: list[str], - chunk_key: str, + record_attributes: list[str], + chunk_key: str, ): if len(record_attributes) < 4 or record_attributes[0] != '"entity"': return None @@ -129,8 +138,8 @@ async def _handle_single_entity_extraction( async def _handle_single_relationship_extraction( - record_attributes: list[str], - chunk_key: str, + record_attributes: list[str], + chunk_key: str, ): if len(record_attributes) < 5 or record_attributes[0] != '"relationship"': return None @@ -156,10 +165,10 @@ async def _handle_single_relationship_extraction( async def _merge_nodes_then_upsert( - entity_name: str, - nodes_data: list[dict], - knowledge_graph_inst: BaseGraphStorage, - global_config: dict, + entity_name: str, + nodes_data: list[dict], + knowledge_graph_inst: BaseGraphStorage, + global_config: dict, ): already_entity_types = [] already_source_ids = [] @@ -203,11 +212,11 @@ async def _merge_nodes_then_upsert( async def _merge_edges_then_upsert( - src_id: str, - tgt_id: str, - edges_data: list[dict], - knowledge_graph_inst: BaseGraphStorage, - global_config: dict, + src_id: str, + tgt_id: str, + edges_data: list[dict], + knowledge_graph_inst: BaseGraphStorage, + global_config: dict, ): already_weights = [] already_source_ids = [] @@ -270,12 +279,12 @@ async def _merge_edges_then_upsert( async def extract_entities( - chunks: dict[str, TextChunkSchema], - knowledge_graph_inst: BaseGraphStorage, - entity_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - global_config: dict, - llm_response_cache: BaseKVStorage = None, + chunks: dict[str, TextChunkSchema], + knowledge_graph_inst: BaseGraphStorage, + entity_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + global_config: dict, + llm_response_cache: BaseKVStorage = None, ) -> Union[BaseGraphStorage, None]: use_llm_func: callable = global_config["llm_model_func"] entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"] @@ -327,13 +336,13 @@ async def extract_entities( already_relations = 0 async def _user_llm_func_with_cache( - input_text: str, history_messages: list[dict[str, str]] = None + input_text: str, history_messages: list[dict[str, str]] = None ) -> str: if enable_llm_cache_for_entity_extract and llm_response_cache: need_to_restore = False if ( - global_config["embedding_cache_config"] - and global_config["embedding_cache_config"]["enabled"] + global_config["embedding_cache_config"] + and global_config["embedding_cache_config"]["enabled"] ): new_config = global_config.copy() new_config["embedding_cache_config"] = None @@ -435,7 +444,7 @@ async def extract_entities( already_relations += len(maybe_edges) now_ticks = PROMPTS["process_tickers"][ already_processed % 
len(PROMPTS["process_tickers"]) - ] + ] print( f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r", end="", @@ -445,10 +454,10 @@ async def extract_entities( results = [] for result in tqdm_async( - asyncio.as_completed([_process_single_content(c) for c in ordered_chunks]), - total=len(ordered_chunks), - desc="Extracting entities from chunks", - unit="chunk", + asyncio.as_completed([_process_single_content(c) for c in ordered_chunks]), + total=len(ordered_chunks), + desc="Extracting entities from chunks", + unit="chunk", ): results.append(await result) @@ -462,32 +471,32 @@ async def extract_entities( logger.info("Inserting entities into storage...") all_entities_data = [] for result in tqdm_async( - asyncio.as_completed( - [ - _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config) - for k, v in maybe_nodes.items() - ] - ), - total=len(maybe_nodes), - desc="Inserting entities", - unit="entity", + asyncio.as_completed( + [ + _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config) + for k, v in maybe_nodes.items() + ] + ), + total=len(maybe_nodes), + desc="Inserting entities", + unit="entity", ): all_entities_data.append(await result) logger.info("Inserting relationships into storage...") all_relationships_data = [] for result in tqdm_async( - asyncio.as_completed( - [ - _merge_edges_then_upsert( - k[0], k[1], v, knowledge_graph_inst, global_config - ) - for k, v in maybe_edges.items() - ] - ), - total=len(maybe_edges), - desc="Inserting relationships", - unit="relationship", + asyncio.as_completed( + [ + _merge_edges_then_upsert( + k[0], k[1], v, knowledge_graph_inst, global_config + ) + for k, v in maybe_edges.items() + ] + ), + total=len(maybe_edges), + desc="Inserting relationships", + unit="relationship", ): all_relationships_data.append(await result) @@ -518,9 +527,9 @@ async def extract_entities( "src_id": dp["src_id"], "tgt_id": dp["tgt_id"], "content": dp["keywords"] - + dp["src_id"] - + dp["tgt_id"] - + dp["description"], + + dp["src_id"] + + dp["tgt_id"] + + dp["description"], "metadata": { "created_at": dp.get("metadata", {}).get("created_at", time.time()) }, @@ -533,14 +542,14 @@ async def extract_entities( async def kg_query( - query, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, - global_config: dict, - hashing_kv: BaseKVStorage = None, + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + global_config: dict, + hashing_kv: BaseKVStorage = None, ) -> str: # Handle cache use_model_func = global_config["llm_model_func"] @@ -660,12 +669,12 @@ async def kg_query( async def _build_query_context( - query: list, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, + query: list, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, ): # ll_entities_context, ll_relations_context, ll_text_units_context = "", "", "" # hl_entities_context, hl_relations_context, hl_text_units_context = "", "", "" @@ -718,9 
+727,9 @@ async def _build_query_context( query_param, ) if ( - hl_entities_context == "" - and hl_relations_context == "" - and hl_text_units_context == "" + hl_entities_context == "" + and hl_relations_context == "" + and hl_text_units_context == "" ): logger.warn("No high level context found. Switching to local mode.") query_param.mode = "local" @@ -759,11 +768,11 @@ async def _build_query_context( async def _get_node_data( - query, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, ): # get similar entities results = await entities_vdb.query(query, top_k=query_param.top_k) @@ -850,10 +859,10 @@ async def _get_node_data( async def _find_most_related_text_unit_from_entities( - node_datas: list[dict], - query_param: QueryParam, - text_chunks_db: BaseKVStorage[TextChunkSchema], - knowledge_graph_inst: BaseGraphStorage, + node_datas: list[dict], + query_param: QueryParam, + text_chunks_db: BaseKVStorage[TextChunkSchema], + knowledge_graph_inst: BaseGraphStorage, ): text_units = [ split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP]) @@ -893,8 +902,8 @@ async def _find_most_related_text_unit_from_entities( if this_edges: for e in this_edges: if ( - e[1] in all_one_hop_text_units_lookup - and c_id in all_one_hop_text_units_lookup[e[1]] + e[1] in all_one_hop_text_units_lookup + and c_id in all_one_hop_text_units_lookup[e[1]] ): all_text_units_lookup[c_id]["relation_counts"] += 1 @@ -924,9 +933,9 @@ async def _find_most_related_text_unit_from_entities( async def _find_most_related_edges_from_entities( - node_datas: list[dict], - query_param: QueryParam, - knowledge_graph_inst: BaseGraphStorage, + node_datas: list[dict], + query_param: QueryParam, + knowledge_graph_inst: BaseGraphStorage, ): all_related_edges = await asyncio.gather( *[knowledge_graph_inst.get_node_edges(dp["entity_name"]) for dp in node_datas] @@ -964,11 +973,11 @@ async def _find_most_related_edges_from_entities( async def _get_edge_data( - keywords, - knowledge_graph_inst: BaseGraphStorage, - relationships_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, + keywords, + knowledge_graph_inst: BaseGraphStorage, + relationships_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, ): results = await relationships_vdb.query(keywords, top_k=query_param.top_k) @@ -1066,9 +1075,9 @@ async def _get_edge_data( async def _find_most_related_entities_from_relationships( - edge_datas: list[dict], - query_param: QueryParam, - knowledge_graph_inst: BaseGraphStorage, + edge_datas: list[dict], + query_param: QueryParam, + knowledge_graph_inst: BaseGraphStorage, ): entity_names = [] seen = set() @@ -1103,10 +1112,10 @@ async def _find_most_related_entities_from_relationships( async def _find_related_text_unit_from_relationships( - edge_datas: list[dict], - query_param: QueryParam, - text_chunks_db: BaseKVStorage[TextChunkSchema], - knowledge_graph_inst: BaseGraphStorage, + edge_datas: list[dict], + query_param: QueryParam, + text_chunks_db: BaseKVStorage[TextChunkSchema], + knowledge_graph_inst: BaseGraphStorage, ): text_units = [ split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP]) @@ -1172,12 +1181,12 @@ def combine_contexts(entities, relationships, sources): 
async def naive_query( - query, - chunks_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, - global_config: dict, - hashing_kv: BaseKVStorage = None, + query, + chunks_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + global_config: dict, + hashing_kv: BaseKVStorage = None, ): # Handle cache use_model_func = global_config["llm_model_func"] @@ -1235,7 +1244,7 @@ async def naive_query( if len(response) > len(sys_prompt): response = ( - response[len(sys_prompt):] + response[len(sys_prompt) :] .replace(sys_prompt, "") .replace("user", "") .replace("model", "") @@ -1263,15 +1272,15 @@ async def naive_query( async def mix_kg_vector_query( - query, - knowledge_graph_inst: BaseGraphStorage, - entities_vdb: BaseVectorStorage, - relationships_vdb: BaseVectorStorage, - chunks_vdb: BaseVectorStorage, - text_chunks_db: BaseKVStorage[TextChunkSchema], - query_param: QueryParam, - global_config: dict, - hashing_kv: BaseKVStorage = None, + query, + knowledge_graph_inst: BaseGraphStorage, + entities_vdb: BaseVectorStorage, + relationships_vdb: BaseVectorStorage, + chunks_vdb: BaseVectorStorage, + text_chunks_db: BaseKVStorage[TextChunkSchema], + query_param: QueryParam, + global_config: dict, + hashing_kv: BaseKVStorage = None, ) -> str: """ Hybrid retrieval implementation combining knowledge graph and vector search. @@ -1296,7 +1305,7 @@ async def mix_kg_vector_query( # Reuse keyword extraction logic from kg_query example_number = global_config["addon_params"].get("example_number", None) if example_number and example_number < len( - PROMPTS["keywords_extraction_examples"] + PROMPTS["keywords_extraction_examples"] ): examples = "\n".join( PROMPTS["keywords_extraction_examples"][: int(example_number)] diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 00000000..2b9253b4 --- /dev/null +++ b/test.ipynb @@ -0,0 +1,740 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "4b5690db12e34685", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-07T05:38:34.174205Z", + "start_time": "2025-01-07T05:38:29.978194Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import logging\n", + "import numpy as np\n", + "from lightrag import LightRAG, QueryParam\n", + "from lightrag.llm import openai_complete_if_cache, openai_embedding\n", + "from lightrag.utils import EmbeddingFunc\n", + "import nest_asyncio" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8c8ee7c061bf9159", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-07T05:38:37.440083Z", + "start_time": "2025-01-07T05:38:37.437666Z" + } + }, + "outputs": [], + "source": [ + "nest_asyncio.apply()\n", + "WORKING_DIR = \"../llm_rag/paper_db/R000088_test2\"\n", + "logging.basicConfig(format=\"%(levelname)s:%(message)s\", level=logging.INFO)\n", + "if not os.path.exists(WORKING_DIR):\n", + " os.mkdir(WORKING_DIR)\n", + "os.environ[\"doubao_api\"] = \"6b890250-0cf6-4eb1-aa82-9c9d711398a7\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a5009d16e0851dca", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-07T05:38:42.594315Z", + "start_time": "2025-01-07T05:38:42.590800Z" + } + }, + "outputs": [], + "source": [ + "async def llm_model_func(\n", + " prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs\n", + ") -> str:\n", + " return await openai_complete_if_cache(\n", + " \"ep-20241218114828-2tlww\",\n", + " prompt,\n", + " 
system_prompt=system_prompt,\n", + " history_messages=history_messages,\n", + " api_key=os.getenv(\"doubao_api\"),\n", + " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", + " **kwargs,\n", + " )\n", + "\n", + "\n", + "async def embedding_func(texts: list[str]) -> np.ndarray:\n", + " return await openai_embedding(\n", + " texts,\n", + " model=\"ep-20241231173413-pgjmk\",\n", + " api_key=os.getenv(\"doubao_api\"),\n", + " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "397fcad24ce4d0ed", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-07T05:38:44.016901Z", + "start_time": "2025-01-07T05:38:44.006291Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:lightrag:Logger initialized for working directory: ../llm_rag/paper_db/R000088_test2\n", + "INFO:lightrag:Load KV llm_response_cache with 0 data\n", + "INFO:lightrag:Load KV full_docs with 0 data\n", + "INFO:lightrag:Load KV text_chunks with 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_entities.json'} 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_relationships.json'} 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_chunks.json'} 0 data\n", + "INFO:lightrag:Loaded document status storage with 0 records\n" + ] + } + ], + "source": [ + "rag = LightRAG(\n", + " working_dir=WORKING_DIR,\n", + " llm_model_func=llm_model_func,\n", + " embedding_func=EmbeddingFunc(\n", + " embedding_dim=4096, max_token_size=8192, func=embedding_func\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1dc3603677f7484d", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-07T05:38:47.509111Z", + "start_time": "2025-01-07T05:38:47.501997Z" + } + }, + "outputs": [], + "source": [ + "with open(\n", + " \"../llm_rag/example/R000088/auto/R000088_full_txt.md\", \"r\", encoding=\"utf-8\"\n", + ") as f:\n", + " content = f.read()\n", + "\n", + "\n", + "async def embedding_func(texts: list[str]) -> np.ndarray:\n", + " return await openai_embedding(\n", + " texts,\n", + " model=\"ep-20241231173413-pgjmk\",\n", + " api_key=os.getenv(\"doubao_api\"),\n", + " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", + " )\n", + "\n", + "\n", + "async def get_embedding_dim():\n", + " test_text = [\"This is a test sentence.\"]\n", + " embedding = await embedding_func(test_text)\n", + " embedding_dim = embedding.shape[1]\n", + " return embedding_dim" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6844202606acfbe5", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-07T05:38:50.666764Z", + "start_time": "2025-01-07T05:38:50.247712Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n" + ] + } + ], + "source": [ + "embedding_dimension = await get_embedding_dim()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d6273839d9681403", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-07T05:42:33.085507Z", + "start_time": "2025-01-07T05:38:56.789348Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"INFO:lightrag:Processing 1 new unique documents\n", + "Processing batch 1: 0%| | 0/1 [00:00标签中,针对每个问题详细分析你的思考过程。然后在<回答>标签中给出所有问题的最终答案。\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7a6491385b050095", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-07T05:43:24.751628Z", + "start_time": "2025-01-07T05:42:50.865679Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:lightrag:kw_prompt result:\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"high_level_keywords\": [\"英文学术研究论文分析\", \"关键信息提取\", \"深入分析\"],\n", + " \"low_level_keywords\": [\"研究队列\", \"队列名称\", \"队列开展国家\", \"性别分布\", \"年龄分布\", \"队列研究时间线\", \"实际参与研究人数\"]\n", + "}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:lightrag:Local query uses 60 entites, 38 relations, 6 text units\n", + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:lightrag:Global query uses 72 entites, 60 relations, 4 text units\n", + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<分析>\n", + "- **分析对象来自哪些研究队列及是单独分析还是联合分析**:\n", + " 通过查找论文内容,发现文中提到“This is a combined analysis of data from 2 randomized, double-blind, placebo-controlled clinical trials (Norwegian Vitamin [NORVIT] trial15 and Western Norway B Vitamin Intervention Trial [WENBIT]16)”,明确是对两个队列的数据进行联合分析,队列名称分别为“Norwegian Vitamin (NORVIT) trial”和“Western Norway B Vitamin Intervention Trial (WENBIT)”。\n", + "- **队列开展的国家**:\n", + " 文中多次提及研究在挪威进行,如“combined analyses and extended follow-up of 2 vitamin B intervention trials among patients with ischemic heart disease in Norway”,所以确定研究开展的国家是挪威。\n", + "- **队列研究对象的性别分布**:\n", + " 从“Mean (SD) age was 62.3 (11.0) years and 23.5% of participants were women”可知,研究对象包含男性和女性,即全体。\n", + "- **队列收集结束时研究对象年龄分布**:\n", + " 已知“Mean (SD) age was 62.3 (11.0) years”是基线时年龄信息,“Median (interquartile range) duration of extended follow-up through December 31, 2007, was 78 (61 - 90) months”,由于随访的中位时间是78个月(约6.5年),所以可推算队列收集结束时研究对象年龄均值约为62.3 + 6.5 = 68.8岁(标准差仍为11.0年)。\n", + "- **队列研究时间线**:\n", + " 根据“2 randomized, double-blind, placebo-controlled clinical trials (Norwegian Vitamin [NORVIT] trial15 and Western Norway B Vitamin Intervention Trial [WENBIT]16) conducted between 1998 and 2005, and an observational posttrial follow-up through December 31, 2007”可知,队列开始收集信息时间为1998年,结束时间为2007年12月31日。\n", + "- **队列结束时实际参与研究人数**:\n", + " 由“A total of 6837 individuals were included in the combined analyses, of whom 6261 (91.6%) participated in posttrial follow-up”可知,队列结束时实际参与研究人数为6261人。\n", + "\n", + "\n", + "<回答>\n", + "- 分析对象来自“Norwegian Vitamin (NORVIT) trial”和“Western Norway B Vitamin Intervention Trial (WENBIT)”两个研究队列,文中是对这两个队列的数据进行联合分析。\n", + "- 队列开展的国家是挪威。\n", + "- 队列研究对象的性别分布为全体。\n", + "- 队列收集结束时,研究对象年龄分布均值约为68.8岁,标准差为11.0年。\n", + "- 队列研究时间线为1998年开始收集信息/建立队列,2007年12月31日结束。\n", + "- 队列结束时实际参与研究人数是6261人。\n" + ] + } + ], + "source": [ + "print(rag.query(prompt1, param=QueryParam(mode=\"hybrid\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fef9d06983da47af", + "metadata": {}, + "outputs": [], 
+ "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 6c78c96854d9ab563a547546dd8652ed59190bd2 Mon Sep 17 00:00:00 2001 From: zrguo Date: Tue, 7 Jan 2025 22:02:34 +0800 Subject: [PATCH 15/21] fix linting errors --- lightrag/operate.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/lightrag/operate.py b/lightrag/operate.py index 59e9f648..ce7b0a8a 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -4,7 +4,6 @@ import re from tqdm.asyncio import tqdm as tqdm_async from typing import Union from collections import Counter, defaultdict -import warnings from .utils import ( logger, clean_str, @@ -605,10 +604,16 @@ async def kg_query( logger.warning("low_level_keywords and high_level_keywords is empty") return PROMPTS["fail_response"] if ll_keywords == [] and query_param.mode in ["local", "hybrid"]: - logger.warning("low_level_keywords is empty, switching from %s mode to global mode", query_param.mode) + logger.warning( + "low_level_keywords is empty, switching from %s mode to global mode", + query_param.mode, + ) query_param.mode = "global" if hl_keywords == [] and query_param.mode in ["global", "hybrid"]: - logger.warning("high_level_keywords is empty, switching from %s mode to local mode", query_param.mode) + logger.warning( + "high_level_keywords is empty, switching from %s mode to local mode", + query_param.mode, + ) query_param.mode = "local" ll_keywords = ", ".join(ll_keywords) if ll_keywords else "" @@ -699,14 +704,22 @@ async def _build_query_context( query_param, ) else: # hybrid mode - ll_entities_context, ll_relations_context, ll_text_units_context = await _get_node_data( + ( + ll_entities_context, + ll_relations_context, + ll_text_units_context, + ) = await _get_node_data( ll_keywords, knowledge_graph_inst, entities_vdb, text_chunks_db, query_param, ) - hl_entities_context, hl_relations_context, hl_text_units_context = await _get_edge_data( + ( + hl_entities_context, + hl_relations_context, + hl_text_units_context, + ) = await _get_edge_data( hl_keywords, knowledge_graph_inst, relationships_vdb, From a9402513909606c76a2e8d5e040f12ecb8aa4739 Mon Sep 17 00:00:00 2001 From: Gurjot Singh Date: Tue, 7 Jan 2025 20:57:39 +0530 Subject: [PATCH 16/21] Implement custom chunking feature --- lightrag/lightrag.py | 66 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 7496d736..2225b2d1 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -458,6 +458,72 @@ class LightRAG: # Ensure all indexes are updated after each document await self._insert_done() + def insert_custom_chunks(self, full_text: str, text_chunks: list[str]): + loop = always_get_an_event_loop() + return loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks)) + + async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]): + + update_storage = False + try: + doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-") + new_docs = { + doc_key: {"content": full_text.strip()} + } + + _add_doc_keys = await self.full_docs.filter_keys([doc_key]) + new_docs = {k: v 
for k, v in new_docs.items() if k in _add_doc_keys} + if not len(new_docs): + logger.warning("This document is already in the storage.") + return + + update_storage = True + logger.info(f"[New Docs] inserting {len(new_docs)} docs") + + inserting_chunks = {} + for chunk_text in text_chunks: + chunk_text_stripped = chunk_text.strip() + chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-") + + inserting_chunks[chunk_key] = { + "content": chunk_text_stripped, + "full_doc_id": doc_key, + } + + _add_chunk_keys = await self.text_chunks.filter_keys(list(inserting_chunks.keys())) + inserting_chunks = { + k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys + } + if not len(inserting_chunks): + logger.warning("All chunks are already in the storage.") + return + + logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks") + + await self.chunks_vdb.upsert(inserting_chunks) + + logger.info("[Entity Extraction]...") + maybe_new_kg = await extract_entities( + inserting_chunks, + knowledge_graph_inst=self.chunk_entity_relation_graph, + entity_vdb=self.entities_vdb, + relationships_vdb=self.relationships_vdb, + global_config=asdict(self), + ) + + if maybe_new_kg is None: + logger.warning("No new entities and relationships found") + return + else: + self.chunk_entity_relation_graph = maybe_new_kg + + await self.full_docs.upsert(new_docs) + await self.text_chunks.upsert(inserting_chunks) + + finally: + if update_storage: + await self._insert_done() + async def _insert_done(self): tasks = [] for storage_inst in [ From 9e7784ab8a642415432c742d8e891f6173886f66 Mon Sep 17 00:00:00 2001 From: zrguo <49157727+LarFii@users.noreply.github.com> Date: Wed, 8 Jan 2025 18:17:32 +0800 Subject: [PATCH 17/21] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f66fb3ce..6c981d92 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@


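As a usage sketch for the `insert_custom_chunks` / `ainsert_custom_chunks` API added in PATCH 16/21 above: the snippet below is illustrative only; the working directory, document text, and chunk boundaries are placeholders, and it assumes a `LightRAG` instance whose LLM and embedding functions are already configured (e.g. as in the README examples).

```python
from lightrag import LightRAG

# Illustrative only: the path and texts are placeholders; LLM/embedding
# defaults are assumed to be configured for this instance.
rag = LightRAG(working_dir="./custom_chunk_kg")

full_text = "Section A. Alpha details...\n\nSection B. Beta details..."

# Caller-defined chunks bypass chunking_by_token_size entirely; each chunk is
# hashed to a chunk-id and linked back to the parent doc-id.
custom_chunks = [
    "Section A. Alpha details...",
    "Section B. Beta details...",
]

# Upserts the chunks into the vector store, runs entity extraction on them,
# then persists full_docs/text_chunks and finalizes via _insert_done().
rag.insert_custom_chunks(full_text, custom_chunks)
```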
From 9565a4663ad8878126f16d667455ca5a22f1d557 Mon Sep 17 00:00:00 2001 From: Gurjot Singh Date: Thu, 9 Jan 2025 00:39:22 +0530 Subject: [PATCH 18/21] Fix trailing whitespace and formatting issues in lightrag.py --- lightrag/lightrag.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 2225b2d1..6af29aa2 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -460,16 +460,15 @@ class LightRAG: def insert_custom_chunks(self, full_text: str, text_chunks: list[str]): loop = always_get_an_event_loop() - return loop.run_until_complete(self.ainsert_custom_chunks(full_text, text_chunks)) + return loop.run_until_complete( + self.ainsert_custom_chunks(full_text, text_chunks) + ) async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]): - update_storage = False try: doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-") - new_docs = { - doc_key: {"content": full_text.strip()} - } + new_docs = {doc_key: {"content": full_text.strip()}} _add_doc_keys = await self.full_docs.filter_keys([doc_key]) new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys} @@ -484,13 +483,15 @@ class LightRAG: for chunk_text in text_chunks: chunk_text_stripped = chunk_text.strip() chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-") - + inserting_chunks[chunk_key] = { "content": chunk_text_stripped, "full_doc_id": doc_key, } - _add_chunk_keys = await self.text_chunks.filter_keys(list(inserting_chunks.keys())) + _add_chunk_keys = await self.text_chunks.filter_keys( + list(inserting_chunks.keys()) + ) inserting_chunks = { k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys } From 65c1450c66a769e9134e900a87706f9bc4ab5a97 Mon Sep 17 00:00:00 2001 From: Saifeddine ALOUI Date: Wed, 8 Jan 2025 20:50:22 +0100 Subject: [PATCH 19/21] fixed retro compatibility with ainsert by making split_by_character get a None default value --- lightrag/lightrag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 7496d736..362b7275 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -320,7 +320,7 @@ class LightRAG: self.ainsert(string_or_strings, split_by_character) ) - async def ainsert(self, string_or_strings, split_by_character): + async def ainsert(self, string_or_strings, split_by_character=None): """Insert documents with checkpoint support Args: From dd213c95be5c63bc61f399f14612028fd40a4a33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=AB=A5=E7=9F=B3=E6=B8=8A?= Date: Thu, 9 Jan 2025 11:55:49 +0800 Subject: [PATCH 20/21] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=BB=85=E5=AD=97?= =?UTF-8?q?=E7=AC=A6=E5=88=86=E5=89=B2=E5=8F=82=E6=95=B0=EF=BC=8C=E5=A6=82?= =?UTF-8?q?=E6=9E=9C=E5=BC=80=E5=90=AF=EF=BC=8C=E4=BB=85=E9=87=87=E7=94=A8?= =?UTF-8?q?=E5=AD=97=E7=AC=A6=E5=88=86=E5=89=B2=EF=BC=8C=E4=B8=8D=E5=BC=80?= =?UTF-8?q?=E5=90=AF=EF=BC=8C=E5=9C=A8=E5=88=86=E5=89=B2=E5=AE=8C=E4=BB=A5?= =?UTF-8?q?=E5=90=8E=E5=A6=82=E6=9E=9Cchunk=E8=BF=87=E5=A4=A7=EF=BC=8C?= =?UTF-8?q?=E4=BC=9A=E7=BB=A7=E7=BB=AD=E6=A0=B9=E6=8D=AEtoken=20size?= =?UTF-8?q?=E5=88=86=E5=89=B2=EF=BC=8C=E6=9B=B4=E6=96=B0=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/test_split_by_character.ipynb | 1296 ++++++++++++++++++++++++ lightrag/lightrag.py | 16 +- lightrag/operate.py | 34 +- test.ipynb | 740 -------------- 4 files changed, 1328 insertions(+), 758 deletions(-) 
create mode 100644 examples/test_split_by_character.ipynb delete mode 100644 test.ipynb diff --git a/examples/test_split_by_character.ipynb b/examples/test_split_by_character.ipynb new file mode 100644 index 00000000..e8e08b92 --- /dev/null +++ b/examples/test_split_by_character.ipynb @@ -0,0 +1,1296 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "4b5690db12e34685", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:40:58.307102Z", + "start_time": "2025-01-09T03:40:51.935233Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import logging\n", + "import numpy as np\n", + "from lightrag import LightRAG, QueryParam\n", + "from lightrag.llm import openai_complete_if_cache, openai_embedding\n", + "from lightrag.utils import EmbeddingFunc\n", + "import nest_asyncio" + ] + }, + { + "cell_type": "markdown", + "id": "dd17956ec322b361", + "metadata": {}, + "source": "#### split by character" + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8c8ee7c061bf9159", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:41:13.961167Z", + "start_time": "2025-01-09T03:41:13.958357Z" + } + }, + "outputs": [], + "source": [ + "nest_asyncio.apply()\n", + "WORKING_DIR = \"../../llm_rag/paper_db/R000088_test1\"\n", + "logging.basicConfig(format=\"%(levelname)s:%(message)s\", level=logging.INFO)\n", + "if not os.path.exists(WORKING_DIR):\n", + " os.mkdir(WORKING_DIR)\n", + "API = os.environ.get(\"DOUBAO_API_KEY\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a5009d16e0851dca", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:41:16.862036Z", + "start_time": "2025-01-09T03:41:16.859306Z" + } + }, + "outputs": [], + "source": [ + "async def llm_model_func(\n", + " prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs\n", + ") -> str:\n", + " return await openai_complete_if_cache(\n", + " \"ep-20241218114828-2tlww\",\n", + " prompt,\n", + " system_prompt=system_prompt,\n", + " history_messages=history_messages,\n", + " api_key=API,\n", + " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", + " **kwargs,\n", + " )\n", + "\n", + "\n", + "async def embedding_func(texts: list[str]) -> np.ndarray:\n", + " return await openai_embedding(\n", + " texts,\n", + " model=\"ep-20241231173413-pgjmk\",\n", + " api_key=API,\n", + " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "397fcad24ce4d0ed", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:41:24.950307Z", + "start_time": "2025-01-09T03:41:24.940353Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:lightrag:Logger initialized for working directory: ../../llm_rag/paper_db/R000088_test1\n", + "INFO:lightrag:Load KV llm_response_cache with 0 data\n", + "INFO:lightrag:Load KV full_docs with 0 data\n", + "INFO:lightrag:Load KV text_chunks with 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test1/vdb_entities.json'} 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test1/vdb_relationships.json'} 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test1/vdb_chunks.json'} 0 data\n", + "INFO:lightrag:Loaded document status storage with 0 records\n" + 
] + } + ], + "source": [ + "rag = LightRAG(\n", + " working_dir=WORKING_DIR,\n", + " llm_model_func=llm_model_func,\n", + " embedding_func=EmbeddingFunc(\n", + " embedding_dim=4096, max_token_size=8192, func=embedding_func\n", + " ),\n", + " chunk_token_size=512,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1dc3603677f7484d", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:41:37.947456Z", + "start_time": "2025-01-09T03:41:37.941901Z" + } + }, + "outputs": [], + "source": [ + "with open(\n", + " \"../../llm_rag/example/R000088/auto/R000088_full_txt.md\", \"r\", encoding=\"utf-8\"\n", + ") as f:\n", + " content = f.read()\n", + "\n", + "\n", + "async def embedding_func(texts: list[str]) -> np.ndarray:\n", + " return await openai_embedding(\n", + " texts,\n", + " model=\"ep-20241231173413-pgjmk\",\n", + " api_key=API,\n", + " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", + " )\n", + "\n", + "\n", + "async def get_embedding_dim():\n", + " test_text = [\"This is a test sentence.\"]\n", + " embedding = await embedding_func(test_text)\n", + " embedding_dim = embedding.shape[1]\n", + " return embedding_dim" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6844202606acfbe5", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:41:39.608541Z", + "start_time": "2025-01-09T03:41:39.165057Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n" + ] + } + ], + "source": [ + "embedding_dimension = await get_embedding_dim()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d6273839d9681403", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:44:34.295345Z", + "start_time": "2025-01-09T03:41:48.324171Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:lightrag:Processing 1 new unique documents\n", + "Processing batch 1: 0%| | 0/1 [00:00标签中,针对每个问题详细分析你的思考过程。然后在<回答>标签中给出所有问题的最终答案。\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7a6491385b050095", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:45:40.829111Z", + "start_time": "2025-01-09T03:45:13.530298Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:lightrag:Local query uses 5 entites, 12 relations, 3 text units\n", + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:lightrag:Global query uses 8 entites, 5 relations, 4 text units\n", + "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<分析>\n", + "1. **该文献主要研究的问题是什么?**\n", + " - 思考过程:通过浏览论文内容,查找作者明确阐述研究目的的部分。文中多处提及“Our study was performed to explore whether folic acid treatment was associated with cancer outcomes and all-cause mortality after extended follow-up”,表明作者旨在探究叶酸治疗与癌症结局及全因死亡率之间的关系,尤其是在经过长期随访后。\n", + "2. 
**该文献采用什么方法进行分析?**\n", + " - 思考过程:寻找描述研究方法和数据分析过程的段落。文中提到“Survival curves were constructed using the Kaplan-Meier method and differences in survival between groups were analyzed using the log-rank test. Estimates of hazard ratios (HRs) with 95% CIs were obtained by using Cox proportional hazards regression models stratified by trial”,可以看出作者使用了Kaplan-Meier法构建生存曲线、log-rank检验分析组间生存差异以及Cox比例风险回归模型估计风险比等方法。\n", + "3. **该文献的主要结论是什么?**\n", + " - 思考过程:定位到论文中总结结论的部分,如“Conclusion Treatment with folic acid plus vitamin $\\mathsf{B}_{12}$ was associated with increased cancer outcomes and all-cause mortality in patients with ischemic heart disease in Norway, where there is no folic acid fortification of foods”,可知作者得出叶酸加维生素$\\mathsf{B}_{12}$治疗与癌症结局和全因死亡率增加有关的结论。\n", + "<回答>\n", + "1. 该文献主要研究的问题是:叶酸治疗与癌症结局及全因死亡率之间的关系,尤其是在经过长期随访后,叶酸治疗是否与癌症结局和全因死亡率相关。\n", + "2. 该文献采用的分析方法包括:使用Kaplan-Meier法构建生存曲线、log-rank检验分析组间生存差异、Cox比例风险回归模型估计风险比等。\n", + "3. 该文献的主要结论是:在挪威没有叶酸强化食品的情况下,叶酸加维生素$\\mathsf{B}_{12}$治疗与缺血性心脏病患者的癌症结局和全因死亡率增加有关。\n", + "\n", + "**参考文献**\n", + "- [VD] In2Norwegianhomocysteine-lowering trialsamongpatientswithischemicheart disease, there was a statistically nonsignificantincreaseincancerincidenceinthe groupsassignedtofolicacidtreatment.15,16 Our study was performed to explore whetherfolicacidtreatmentwasassociatedwithcanceroutcomesandall-cause mortality after extended follow-up.\n", + "- [VD] Survivalcurveswereconstructedusing theKaplan-Meiermethodanddifferences insurvivalbetweengroupswereanalyzed usingthelog-ranktest.Estimatesofhazard ratios (HRs) with $95\\%$ CIs were obtainedbyusingCoxproportionalhazards regressionmodelsstratifiedbytrial.\n", + "- [VD] Conclusion Treatment with folic acid plus vitamin $\\mathsf{B}_{12}$ was associated with increased cancer outcomes and all-cause mortality in patients with ischemic heart disease in Norway, where there is no folic acid fortification of foods.\n" + ] + } + ], + "source": [ + "resp = rag.query(prompt1, param=QueryParam(mode=\"mix\", top_k=5))\n", + "print(resp)" + ] + }, + { + "cell_type": "markdown", + "id": "4e5bfad24cb721a8", + "metadata": {}, + "source": "#### split by character only" + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "44e2992dc95f8ce0", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:47:40.988796Z", + "start_time": "2025-01-09T03:47:40.982648Z" + } + }, + "outputs": [], + "source": [ + "WORKING_DIR = \"../../llm_rag/paper_db/R000088_test2\"\n", + "if not os.path.exists(WORKING_DIR):\n", + " os.mkdir(WORKING_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "62c63385d2d973d5", + "metadata": { + "ExecuteTime": { + "end_time": "2025-01-09T03:51:39.951329Z", + "start_time": "2025-01-09T03:49:15.218976Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:lightrag:Logger initialized for working directory: ../../llm_rag/paper_db/R000088_test2\n", + "INFO:lightrag:Load KV llm_response_cache with 0 data\n", + "INFO:lightrag:Load KV full_docs with 0 data\n", + "INFO:lightrag:Load KV text_chunks with 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test2/vdb_entities.json'} 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test2/vdb_relationships.json'} 0 data\n", + "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': 
'../../llm_rag/paper_db/R000088_test2/vdb_chunks.json'} 0 data\n", + "INFO:lightrag:Loaded document status storage with 0 records\n", + "INFO:lightrag:Processing 1 new unique documents\n", + "Processing batch 1: 0%| | 0/1 [00:00\n", + "- **该文献主要研究的问题是什么?**\n", + " - **思考过程**:通过浏览论文的标题、摘要、引言等部分,寻找关于研究目的和问题的描述。论文标题为“Cancer Incidence and Mortality After Treatment With Folic Acid and Vitamin B12”,摘要中的“Objective”部分明确指出研究目的是“To evaluate effects of treatment with B vitamins on cancer outcomes and all-cause mortality in 2 randomized controlled trials”。因此,可以确定该文献主要研究的问题是评估B族维生素治疗对两项随机对照试验中癌症结局和全因死亡率的影响。\n", + "- **该文献采用什么方法进行分析?**\n", + " - **思考过程**:在论文的“METHODS”部分详细描述了研究方法。文中提到这是一个对两项随机、双盲、安慰剂对照临床试验(Norwegian Vitamin [NORVIT] trial和Western Norway B Vitamin Intervention Trial [WENBIT])数据的联合分析,并进行了观察性的试验后随访。具体包括对参与者进行分组干预(不同剂量的叶酸、维生素B12、维生素B6或安慰剂),收集临床信息和血样,分析循环B族维生素、同型半胱氨酸和可替宁等指标,并进行基因分型等,还涉及到多种统计分析方法,如计算预期癌症发生率、构建生存曲线、进行Cox比例风险回归模型分析等。\n", + "- **该文献的主要结论是什么?**\n", + " - **思考过程**:在论文的“Results”和“Conclusion”部分寻找主要结论。研究结果表明,在治疗期间,接受叶酸加维生素B12治疗的参与者血清叶酸浓度显著增加,且在后续随访中,该组癌症发病率、癌症死亡率和全因死亡率均有所上升,主要是肺癌发病率增加,而维生素B6治疗未显示出显著影响。结论部分明确指出“Treatment with folic acid plus vitamin $\\mathsf{B}_{12}$ was associated with increased cancer outcomes and all-cause mortality in patients with ischemic heart disease in Norway, where there is no folic acid fortification of foods”。\n", + "\n", + "\n", + "<回答>\n", + "- **主要研究问题**:评估B族维生素治疗对两项随机对照试验中癌症结局和全因死亡率的影响。\n", + "- **研究方法**:采用对两项随机、双盲、安慰剂对照临床试验(Norwegian Vitamin [NORVIT] trial和Western Norway B Vitamin Intervention Trial [WENBIT])数据的联合分析,并进行观察性的试验后随访,涉及分组干预、多种指标检测以及多种统计分析方法。\n", + "- **主要结论**:在挪威(食品中未添加叶酸),对于缺血性心脏病患者,叶酸加维生素B12治疗与癌症结局和全因死亡率的增加有关,而维生素B6治疗未显示出显著影响。\n", + "\n", + "**参考文献**\n", + "- [VD] Cancer Incidence and Mortality After Treatment With Folic Acid and Vitamin B12\n", + "- [VD] METHODS Study Design, Participants, and Study Intervention\n", + "- [VD] RESULTS\n", + "- [VD] Conclusion\n", + "- [VD] Objective To evaluate effects of treatment with B vitamins on cancer outcomes and all-cause mortality in 2 randomized controlled trials.\n" + ] + } + ], + "source": [ + "resp = rag.query(prompt1, param=QueryParam(mode=\"mix\", top_k=5))\n", + "print(resp)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ba6fa79a2550d10", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 7496d736..b94ff821 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -314,18 +314,25 @@ class LightRAG: "JsonDocStatusStorage": JsonDocStatusStorage, } - def insert(self, string_or_strings, split_by_character=None): + def insert( + self, string_or_strings, split_by_character=None, split_by_character_only=False + ): loop = always_get_an_event_loop() return loop.run_until_complete( - self.ainsert(string_or_strings, split_by_character) + self.ainsert(string_or_strings, split_by_character, split_by_character_only) ) - async def ainsert(self, string_or_strings, split_by_character): + async def ainsert( + self, string_or_strings, split_by_character, split_by_character_only + ): """Insert 
documents with checkpoint support Args: string_or_strings: Single document string or list of document strings - split_by_character: if split_by_character is not None, split the string by character + split_by_character: if split_by_character is not None, split the string by character, if chunk longer than + chunk_size, split the sub chunk by token size. + split_by_character_only: if split_by_character_only is True, split the string by character only, when + split_by_character is None, this parameter is ignored. """ if isinstance(string_or_strings, str): string_or_strings = [string_or_strings] @@ -384,6 +391,7 @@ class LightRAG: for dp in chunking_by_token_size( doc["content"], split_by_character=split_by_character, + split_by_character_only=split_by_character_only, overlap_token_size=self.chunk_overlap_token_size, max_token_size=self.chunk_token_size, tiktoken_model=self.tiktoken_model_name, diff --git a/lightrag/operate.py b/lightrag/operate.py index 1128b41c..58ae3703 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -36,6 +36,7 @@ import time def chunking_by_token_size( content: str, split_by_character=None, + split_by_character_only=False, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o", @@ -45,21 +46,26 @@ def chunking_by_token_size( if split_by_character: raw_chunks = content.split(split_by_character) new_chunks = [] - for chunk in raw_chunks: - _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model) - if len(_tokens) > max_token_size: - for start in range( - 0, len(_tokens), max_token_size - overlap_token_size - ): - chunk_content = decode_tokens_by_tiktoken( - _tokens[start : start + max_token_size], - model_name=tiktoken_model, - ) - new_chunks.append( - (min(max_token_size, len(_tokens) - start), chunk_content) - ) - else: + if split_by_character_only: + for chunk in raw_chunks: + _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model) new_chunks.append((len(_tokens), chunk)) + else: + for chunk in raw_chunks: + _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model) + if len(_tokens) > max_token_size: + for start in range( + 0, len(_tokens), max_token_size - overlap_token_size + ): + chunk_content = decode_tokens_by_tiktoken( + _tokens[start : start + max_token_size], + model_name=tiktoken_model, + ) + new_chunks.append( + (min(max_token_size, len(_tokens) - start), chunk_content) + ) + else: + new_chunks.append((len(_tokens), chunk)) for index, (_len, chunk) in enumerate(new_chunks): results.append( { diff --git a/test.ipynb b/test.ipynb deleted file mode 100644 index 2b9253b4..00000000 --- a/test.ipynb +++ /dev/null @@ -1,740 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "4b5690db12e34685", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:38:34.174205Z", - "start_time": "2025-01-07T05:38:29.978194Z" - } - }, - "outputs": [], - "source": [ - "import os\n", - "import logging\n", - "import numpy as np\n", - "from lightrag import LightRAG, QueryParam\n", - "from lightrag.llm import openai_complete_if_cache, openai_embedding\n", - "from lightrag.utils import EmbeddingFunc\n", - "import nest_asyncio" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8c8ee7c061bf9159", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:38:37.440083Z", - "start_time": "2025-01-07T05:38:37.437666Z" - } - }, - "outputs": [], - "source": [ - "nest_asyncio.apply()\n", - "WORKING_DIR = \"../llm_rag/paper_db/R000088_test2\"\n", - 
"logging.basicConfig(format=\"%(levelname)s:%(message)s\", level=logging.INFO)\n", - "if not os.path.exists(WORKING_DIR):\n", - " os.mkdir(WORKING_DIR)\n", - "os.environ[\"doubao_api\"] = \"6b890250-0cf6-4eb1-aa82-9c9d711398a7\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a5009d16e0851dca", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:38:42.594315Z", - "start_time": "2025-01-07T05:38:42.590800Z" - } - }, - "outputs": [], - "source": [ - "async def llm_model_func(\n", - " prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs\n", - ") -> str:\n", - " return await openai_complete_if_cache(\n", - " \"ep-20241218114828-2tlww\",\n", - " prompt,\n", - " system_prompt=system_prompt,\n", - " history_messages=history_messages,\n", - " api_key=os.getenv(\"doubao_api\"),\n", - " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", - " **kwargs,\n", - " )\n", - "\n", - "\n", - "async def embedding_func(texts: list[str]) -> np.ndarray:\n", - " return await openai_embedding(\n", - " texts,\n", - " model=\"ep-20241231173413-pgjmk\",\n", - " api_key=os.getenv(\"doubao_api\"),\n", - " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "397fcad24ce4d0ed", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:38:44.016901Z", - "start_time": "2025-01-07T05:38:44.006291Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:lightrag:Logger initialized for working directory: ../llm_rag/paper_db/R000088_test2\n", - "INFO:lightrag:Load KV llm_response_cache with 0 data\n", - "INFO:lightrag:Load KV full_docs with 0 data\n", - "INFO:lightrag:Load KV text_chunks with 0 data\n", - "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_entities.json'} 0 data\n", - "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_relationships.json'} 0 data\n", - "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_chunks.json'} 0 data\n", - "INFO:lightrag:Loaded document status storage with 0 records\n" - ] - } - ], - "source": [ - "rag = LightRAG(\n", - " working_dir=WORKING_DIR,\n", - " llm_model_func=llm_model_func,\n", - " embedding_func=EmbeddingFunc(\n", - " embedding_dim=4096, max_token_size=8192, func=embedding_func\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "1dc3603677f7484d", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:38:47.509111Z", - "start_time": "2025-01-07T05:38:47.501997Z" - } - }, - "outputs": [], - "source": [ - "with open(\n", - " \"../llm_rag/example/R000088/auto/R000088_full_txt.md\", \"r\", encoding=\"utf-8\"\n", - ") as f:\n", - " content = f.read()\n", - "\n", - "\n", - "async def embedding_func(texts: list[str]) -> np.ndarray:\n", - " return await openai_embedding(\n", - " texts,\n", - " model=\"ep-20241231173413-pgjmk\",\n", - " api_key=os.getenv(\"doubao_api\"),\n", - " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n", - " )\n", - "\n", - "\n", - "async def get_embedding_dim():\n", - " test_text = [\"This is a test sentence.\"]\n", - " embedding = await embedding_func(test_text)\n", - " embedding_dim = embedding.shape[1]\n", - " return embedding_dim" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - 
"id": "6844202606acfbe5", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:38:50.666764Z", - "start_time": "2025-01-07T05:38:50.247712Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n" - ] - } - ], - "source": [ - "embedding_dimension = await get_embedding_dim()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "d6273839d9681403", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:42:33.085507Z", - "start_time": "2025-01-07T05:38:56.789348Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:lightrag:Processing 1 new unique documents\n", - "Processing batch 1: 0%| | 0/1 [00:00标签中,针对每个问题详细分析你的思考过程。然后在<回答>标签中给出所有问题的最终答案。\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "7a6491385b050095", - "metadata": { - "ExecuteTime": { - "end_time": "2025-01-07T05:43:24.751628Z", - "start_time": "2025-01-07T05:42:50.865679Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n", - "INFO:lightrag:kw_prompt result:\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"high_level_keywords\": [\"英文学术研究论文分析\", \"关键信息提取\", \"深入分析\"],\n", - " \"low_level_keywords\": [\"研究队列\", \"队列名称\", \"队列开展国家\", \"性别分布\", \"年龄分布\", \"队列研究时间线\", \"实际参与研究人数\"]\n", - "}\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", - "INFO:lightrag:Local query uses 60 entites, 38 relations, 6 text units\n", - "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", - "INFO:lightrag:Global query uses 72 entites, 60 relations, 4 text units\n", - "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "<分析>\n", - "- **分析对象来自哪些研究队列及是单独分析还是联合分析**:\n", - " 通过查找论文内容,发现文中提到“This is a combined analysis of data from 2 randomized, double-blind, placebo-controlled clinical trials (Norwegian Vitamin [NORVIT] trial15 and Western Norway B Vitamin Intervention Trial [WENBIT]16)”,明确是对两个队列的数据进行联合分析,队列名称分别为“Norwegian Vitamin (NORVIT) trial”和“Western Norway B Vitamin Intervention Trial (WENBIT)”。\n", - "- **队列开展的国家**:\n", - " 文中多次提及研究在挪威进行,如“combined analyses and extended follow-up of 2 vitamin B intervention trials among patients with ischemic heart disease in Norway”,所以确定研究开展的国家是挪威。\n", - "- **队列研究对象的性别分布**:\n", - " 从“Mean (SD) age was 62.3 (11.0) years and 23.5% of participants were women”可知,研究对象包含男性和女性,即全体。\n", - "- **队列收集结束时研究对象年龄分布**:\n", - " 已知“Mean (SD) age was 62.3 (11.0) years”是基线时年龄信息,“Median (interquartile range) duration of extended follow-up through December 31, 2007, was 78 (61 - 90) months”,由于随访的中位时间是78个月(约6.5年),所以可推算队列收集结束时研究对象年龄均值约为62.3 + 6.5 = 68.8岁(标准差仍为11.0年)。\n", - "- **队列研究时间线**:\n", - " 根据“2 randomized, double-blind, placebo-controlled clinical trials (Norwegian Vitamin [NORVIT] trial15 and Western Norway B Vitamin Intervention Trial [WENBIT]16) conducted between 1998 and 2005, and an observational posttrial follow-up through December 31, 2007”可知,队列开始收集信息时间为1998年,结束时间为2007年12月31日。\n", - "- 
**队列结束时实际参与研究人数**:\n", - " 由“A total of 6837 individuals were included in the combined analyses, of whom 6261 (91.6%) participated in posttrial follow-up”可知,队列结束时实际参与研究人数为6261人。\n", - "\n", - "\n", - "<回答>\n", - "- 分析对象来自“Norwegian Vitamin (NORVIT) trial”和“Western Norway B Vitamin Intervention Trial (WENBIT)”两个研究队列,文中是对这两个队列的数据进行联合分析。\n", - "- 队列开展的国家是挪威。\n", - "- 队列研究对象的性别分布为全体。\n", - "- 队列收集结束时,研究对象年龄分布均值约为68.8岁,标准差为11.0年。\n", - "- 队列研究时间线为1998年开始收集信息/建立队列,2007年12月31日结束。\n", - "- 队列结束时实际参与研究人数是6261人。\n" - ] - } - ], - "source": [ - "print(rag.query(prompt1, param=QueryParam(mode=\"hybrid\")))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fef9d06983da47af", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From acde4ed173614bca12f50a7a2f185b7f6f0ef2c1 Mon Sep 17 00:00:00 2001 From: adikalra <54812001+AdiKalra@users.noreply.github.com> Date: Thu, 9 Jan 2025 17:20:24 +0530 Subject: [PATCH 21/21] Add custom chunking function. --- lightrag/lightrag.py | 7 ++++++- lightrag/operate.py | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 9905ee74..596fbdbf 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -187,6 +187,10 @@ class LightRAG: # Add new field for document status storage type doc_status_storage: str = field(default="JsonDocStatusStorage") + # Custom Chunking Function + chunking_func: callable = chunking_by_token_size + chunking_func_kwargs: dict = field(default_factory=dict) + def __post_init__(self): log_file = os.path.join("lightrag.log") set_logger(log_file) @@ -388,13 +392,14 @@ class LightRAG: **dp, "full_doc_id": doc_id, } - for dp in chunking_by_token_size( + for dp in self.chunking_func( doc["content"], split_by_character=split_by_character, split_by_character_only=split_by_character_only, overlap_token_size=self.chunk_overlap_token_size, max_token_size=self.chunk_token_size, tiktoken_model=self.tiktoken_model_name, + **self.chunking_func_kwargs, ) } diff --git a/lightrag/operate.py b/lightrag/operate.py index 09871659..7216c07f 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -39,6 +39,7 @@ def chunking_by_token_size( overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o", + **kwargs, ): tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model) results = []
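To close out the series, a sketch of how the chunking options from PATCH 19-21/21 combine: `split_by_character` (now defaulting to `None`), the new `split_by_character_only` flag, and the pluggable `chunking_func` / `chunking_func_kwargs` fields. Everything below is illustrative: the working directories and text are placeholders, the custom chunker is a hypothetical example rather than part of the library, and configured LLM/embedding functions are assumed.

```python
from lightrag import LightRAG
from lightrag.operate import chunking_by_token_size

text = "# Intro\nSome prose...\n# Methods\nMore prose..."

# split_by_character_only=True keeps oversized character-split chunks whole;
# with the default False, chunks longer than chunk_token_size are re-split by
# token size. The flag is ignored when split_by_character is None.
rag = LightRAG(working_dir="./kg_char_split")  # assumes model funcs are configured
rag.insert(text, split_by_character="\n#", split_by_character_only=True)


# PATCH 21/21 makes the chunker pluggable. A hypothetical custom chunker that
# simply delegates to the built-in token-size chunker; a real one could split
# on sentences, markdown headers, etc. Extra settings arrive via **kwargs,
# populated from LightRAG.chunking_func_kwargs.
def my_chunking_func(
    content,
    split_by_character=None,
    split_by_character_only=False,
    overlap_token_size=128,
    max_token_size=1024,
    tiktoken_model="gpt-4o",
    **kwargs,
):
    return chunking_by_token_size(
        content,
        split_by_character=split_by_character,
        split_by_character_only=split_by_character_only,
        overlap_token_size=overlap_token_size,
        max_token_size=max_token_size,
        tiktoken_model=tiktoken_model,
    )


rag_custom = LightRAG(
    working_dir="./kg_custom_chunker",
    chunking_func=my_chunking_func,
    chunking_func_kwargs={},  # forwarded to my_chunking_func as **kwargs
)
rag_custom.insert(text)
```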