Merge remote-tracking branch 'origin/main'

# Conflicts:
#	README.md
Samuel Chan
2025-01-11 10:40:09 +08:00
8 changed files with 1521 additions and 185 deletions

View File

@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2025 Gustavo Ye
+Copyright (c) 2025 LarFii
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View File

@@ -12,7 +12,7 @@
</p>
<p>
<img src='https://img.shields.io/github/stars/hkuds/lightrag?color=green&style=social' />
-<img src="https://img.shields.io/badge/python->=3.10-blue">
+<img src="https://img.shields.io/badge/python-3.10-blue">
<a href="https://pypi.org/project/lightrag-hku/"><img src="https://img.shields.io/pypi/v/lightrag-hku.svg"></a>
<a href="https://pepy.tech/project/lightrag-hku"><img src="https://static.pepy.tech/badge/lightrag-hku/month"></a>
</p>
@@ -637,7 +637,7 @@ if __name__ == "__main__":
| **llm\_model\_kwargs** | `dict` | Additional parameters for LLM generation | |
| **vector\_db\_storage\_cls\_kwargs** | `dict` | Additional parameters for vector database (currently not used) | |
| **enable\_llm\_cache** | `bool` | If `TRUE`, stores LLM results in cache; repeated prompts return cached responses | `TRUE` |
-| **enable\_llm\_cache\_for\_entity\_extract** | `bool` | If `TRUE`, stores LLM results in cache for entity extraction; Good for beginners to debug your application | `FALSE` |
+| **enable\_llm\_cache\_for\_entity\_extract** | `bool` | If `TRUE`, stores LLM results in cache for entity extraction; Good for beginners to debug your application | `TRUE` |
| **addon\_params** | `dict` | Additional parameters, e.g., `{"example_number": 1, "language": "Simplified Chinese", "entity_types": ["organization", "person", "geo", "event"], "insert_batch_size": 10}`: sets example limit, output language, and batch size for document processing | `example_number: all examples, language: English, insert_batch_size: 10` |
| **convert\_response\_to\_json\_func** | `callable` | Not used | `convert_response_to_json` |
| **embedding\_cache\_config** | `dict` | Configuration for question-answer caching. Contains three parameters:<br>- `enabled`: Boolean value to enable/disable cache lookup functionality. When enabled, the system will check cached responses before generating new answers.<br>- `similarity_threshold`: Float value (0-1), similarity threshold. When a new question's similarity with a cached question exceeds this threshold, the cached answer will be returned directly without calling the LLM.<br>- `use_llm_check`: Boolean value to enable/disable LLM similarity verification. When enabled, LLM will be used as a secondary check to verify the similarity between questions before returning cached answers. | Default: `{"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}` |
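For reference, the caching parameters above combine in a single constructor call. This is a minimal sketch, assuming the `gpt_4o_mini_complete` binding from `lightrag.llm` and an illustrative `working_dir`; adapt both to your deployment:

```python
from lightrag import LightRAG
from lightrag.llm import gpt_4o_mini_complete  # assumed LLM binding; swap in your own

rag = LightRAG(
    working_dir="./rag_storage",                  # illustrative storage path
    llm_model_func=gpt_4o_mini_complete,
    enable_llm_cache=True,                        # reuse cached LLM answers for repeated prompts
    enable_llm_cache_for_entity_extract=True,     # new default: also cache entity-extraction calls
    embedding_cache_config={
        "enabled": True,                          # check cached questions before calling the LLM
        "similarity_threshold": 0.95,             # reuse an answer above this similarity
        "use_llm_check": False,                   # skip the secondary LLM verification step
    },
    addon_params={"language": "English", "insert_batch_size": 10},
)
```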
@@ -892,69 +892,6 @@ def extract_queries(file_path):
```
</details>
## Code Structure
```python
.
├── .github/
│ ├── workflows/
│ │ └── linting.yaml
├── examples/
│ ├── batch_eval.py
│ ├── generate_query.py
│ ├── graph_visual_with_html.py
│ ├── graph_visual_with_neo4j.py
│ ├── insert_custom_kg.py
│ ├── lightrag_api_openai_compatible_demo.py
│ ├── lightrag_api_oracle_demo..py
│ ├── lightrag_azure_openai_demo.py
│ ├── lightrag_bedrock_demo.py
│ ├── lightrag_hf_demo.py
│ ├── lightrag_lmdeploy_demo.py
│ ├── lightrag_nvidia_demo.py
│ ├── lightrag_ollama_demo.py
│ ├── lightrag_openai_compatible_demo.py
│ ├── lightrag_openai_demo.py
│ ├── lightrag_oracle_demo.py
│ ├── lightrag_siliconcloud_demo.py
│ └── vram_management_demo.py
├── lightrag/
│ ├── api/
│ │ ├── lollms_lightrag_server.py
│ │ ├── ollama_lightrag_server.py
│ │ ├── openai_lightrag_server.py
│ │ ├── azure_openai_lightrag_server.py
│ │ └── requirements.txt
│ ├── kg/
│ │ ├── __init__.py
│ │ ├── oracle_impl.py
│ │ └── neo4j_impl.py
│ ├── __init__.py
│ ├── base.py
│ ├── lightrag.py
│ ├── llm.py
│ ├── operate.py
│ ├── prompt.py
│ ├── storage.py
│ └── utils.py
├── reproduce/
│ ├── Step_0.py
│ ├── Step_1_openai_compatible.py
│ ├── Step_1.py
│ ├── Step_2.py
│ ├── Step_3_openai_compatible.py
│ └── Step_3.py
├── .gitignore
├── .pre-commit-config.yaml
├── get_all_edges_nx.py
├── LICENSE
├── README.md
├── requirements.txt
├── setup.py
├── test_neo4j.py
└── test.py
```
## Install with API Support
LightRAG provides optional API support through FastAPI servers that add RAG capabilities to existing LLM services. You can install LightRAG with API support in two ways:

File diff suppressed because it is too large

View File

@@ -1,5 +1,5 @@
from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
-__version__ = "1.0.9"
+__version__ = "1.1.0"
__author__ = "Zirui Guo"
__url__ = "https://github.com/HKUDS/LightRAG"

View File

@@ -45,6 +45,7 @@ from .storage import (
from .prompt import GRAPH_FIELD_SEP
# future KG integrations
# from .kg.ArangoDB_impl import (
@@ -177,7 +178,7 @@ class LightRAG:
enable_llm_cache: bool = True
# Sometimes there are some reason the LLM failed at Extracting Entities, and we want to continue without LLM cost, we can use this flag
-enable_llm_cache_for_entity_extract: bool = False
+enable_llm_cache_for_entity_extract: bool = True
# extension
addon_params: dict = field(default_factory=dict)
@@ -186,6 +187,10 @@ class LightRAG:
# Add new field for document status storage type
doc_status_storage: str = field(default="JsonDocStatusStorage")
# Custom Chunking Function
chunking_func: callable = chunking_by_token_size
chunking_func_kwargs: dict = field(default_factory=dict)
def __post_init__(self):
log_file = os.path.join("lightrag.log")
set_logger(log_file)
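The two new fields above make the chunker pluggable. The sketch below is inferred from how `self.chunking_func` is invoked later in this diff; `chunking_by_paragraph` and its `separator` keyword are hypothetical names, and the token counts are only rough:

```python
from lightrag import LightRAG

def chunking_by_paragraph(
    content: str,
    split_by_character=None,
    split_by_character_only=False,
    overlap_token_size=128,
    max_token_size=1024,
    tiktoken_model="gpt-4o",
    separator="\n\n",          # hypothetical extra argument supplied via chunking_func_kwargs
    **kwargs,
):
    # Return the same shape chunking_by_token_size produces: tokens, content, chunk_order_index.
    chunks = []
    for index, piece in enumerate(p.strip() for p in content.split(separator) if p.strip()):
        chunks.append(
            {
                "tokens": len(piece.split()),   # rough word count; real code would use tiktoken
                "content": piece,
                "chunk_order_index": index,
            }
        )
    return chunks

rag = LightRAG(
    working_dir="./rag_storage",                   # illustrative; other arguments omitted
    chunking_func=chunking_by_paragraph,
    chunking_func_kwargs={"separator": "\n\n"},
)
```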
@@ -313,15 +318,25 @@ class LightRAG:
"JsonDocStatusStorage": JsonDocStatusStorage, "JsonDocStatusStorage": JsonDocStatusStorage,
} }
def insert(self, string_or_strings): def insert(
self, string_or_strings, split_by_character=None, split_by_character_only=False
):
loop = always_get_an_event_loop() loop = always_get_an_event_loop()
return loop.run_until_complete(self.ainsert(string_or_strings)) return loop.run_until_complete(
self.ainsert(string_or_strings, split_by_character, split_by_character_only)
)
async def ainsert(self, string_or_strings): async def ainsert(
self, string_or_strings, split_by_character=None, split_by_character_only=False
):
"""Insert documents with checkpoint support """Insert documents with checkpoint support
Args: Args:
string_or_strings: Single document string or list of document strings string_or_strings: Single document string or list of document strings
split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
chunk_size, split the sub chunk by token size.
split_by_character_only: if split_by_character_only is True, split the string by character only, when
split_by_character is None, this parameter is ignored.
""" """
if isinstance(string_or_strings, str): if isinstance(string_or_strings, str):
string_or_strings = [string_or_strings] string_or_strings = [string_or_strings]
@@ -377,11 +392,14 @@ class LightRAG:
**dp,
"full_doc_id": doc_id,
}
-for dp in chunking_by_token_size(
+for dp in self.chunking_func(
doc["content"],
split_by_character=split_by_character,
split_by_character_only=split_by_character_only,
overlap_token_size=self.chunk_overlap_token_size,
max_token_size=self.chunk_token_size,
tiktoken_model=self.tiktoken_model_name,
**self.chunking_func_kwargs,
)
}
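A caller-side sketch of the new `split_by_character` options, assuming a `LightRAG` instance `rag` configured as in the earlier sketches and an illustrative input file:

```python
with open("my_doc.txt") as f:   # illustrative input
    text = f.read()

# Split on blank lines first; pieces longer than chunk_token_size still fall
# back to token-window chunking with overlap.
rag.insert(text, split_by_character="\n\n")

# Split on the separator only and keep the pieces as-is, regardless of length.
rag.insert(text, split_by_character="\n\n", split_by_character_only=True)
```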
@@ -453,6 +471,73 @@ class LightRAG:
# Ensure all indexes are updated after each document
await self._insert_done()
def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
loop = always_get_an_event_loop()
return loop.run_until_complete(
self.ainsert_custom_chunks(full_text, text_chunks)
)
async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]):
update_storage = False
try:
doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-")
new_docs = {doc_key: {"content": full_text.strip()}}
_add_doc_keys = await self.full_docs.filter_keys([doc_key])
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
if not len(new_docs):
logger.warning("This document is already in the storage.")
return
update_storage = True
logger.info(f"[New Docs] inserting {len(new_docs)} docs")
inserting_chunks = {}
for chunk_text in text_chunks:
chunk_text_stripped = chunk_text.strip()
chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-")
inserting_chunks[chunk_key] = {
"content": chunk_text_stripped,
"full_doc_id": doc_key,
}
_add_chunk_keys = await self.text_chunks.filter_keys(
list(inserting_chunks.keys())
)
inserting_chunks = {
k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
}
if not len(inserting_chunks):
logger.warning("All chunks are already in the storage.")
return
logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks")
await self.chunks_vdb.upsert(inserting_chunks)
logger.info("[Entity Extraction]...")
maybe_new_kg = await extract_entities(
inserting_chunks,
knowledge_graph_inst=self.chunk_entity_relation_graph,
entity_vdb=self.entities_vdb,
relationships_vdb=self.relationships_vdb,
global_config=asdict(self),
)
if maybe_new_kg is None:
logger.warning("No new entities and relationships found")
return
else:
self.chunk_entity_relation_graph = maybe_new_kg
await self.full_docs.upsert(new_docs)
await self.text_chunks.upsert(inserting_chunks)
finally:
if update_storage:
await self._insert_done()
async def _insert_done(self):
tasks = []
for storage_inst in [
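The `insert_custom_chunks` entry point added above accepts chunks produced outside LightRAG. A hedged usage sketch, again assuming a configured `rag` instance; the chunk boundaries are illustrative:

```python
full_text = "Chapter 1. ...\n\nChapter 2. ..."           # illustrative document
my_chunks = ["Chapter 1. ...", "Chapter 2. ..."]          # chunks from your own splitter

# Stores the document and the supplied chunks, runs entity extraction on them,
# and skips anything already present in storage.
rag.insert_custom_chunks(full_text, my_chunks)
```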

View File

@@ -4,7 +4,6 @@ import re
from tqdm.asyncio import tqdm as tqdm_async
from typing import Union
from collections import Counter, defaultdict
import warnings
from .utils import (
logger,
clean_str,
@@ -34,10 +33,48 @@ import time
def chunking_by_token_size(
-content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o"
+content: str,
split_by_character=None,
split_by_character_only=False,
overlap_token_size=128,
max_token_size=1024,
tiktoken_model="gpt-4o",
**kwargs,
):
tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
results = []
if split_by_character:
raw_chunks = content.split(split_by_character)
new_chunks = []
if split_by_character_only:
for chunk in raw_chunks:
_tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model)
new_chunks.append((len(_tokens), chunk))
else:
for chunk in raw_chunks:
_tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model)
if len(_tokens) > max_token_size:
for start in range(
0, len(_tokens), max_token_size - overlap_token_size
):
chunk_content = decode_tokens_by_tiktoken(
_tokens[start : start + max_token_size],
model_name=tiktoken_model,
)
new_chunks.append(
(min(max_token_size, len(_tokens) - start), chunk_content)
)
else:
new_chunks.append((len(_tokens), chunk))
for index, (_len, chunk) in enumerate(new_chunks):
results.append(
{
"tokens": _len,
"content": chunk.strip(),
"chunk_order_index": index,
}
)
else:
for index, start in enumerate(
range(0, len(tokens), max_token_size - overlap_token_size)
):
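Called directly, the extended chunker behaves as follows. A minimal sketch, assuming the function is importable from `lightrag.operate` as in this repository layout and that `tiktoken` is installed:

```python
from lightrag.operate import chunking_by_token_size

text = "First section...\n\nSecond, much longer section..."   # illustrative input

chunks = chunking_by_token_size(
    text,
    split_by_character="\n\n",       # split on blank lines first
    split_by_character_only=False,   # oversized pieces still fall back to token windows
    overlap_token_size=128,
    max_token_size=1024,
    tiktoken_model="gpt-4o",
)
for chunk in chunks:
    print(chunk["chunk_order_index"], chunk["tokens"], chunk["content"][:40])
```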
@@ -574,15 +611,22 @@ async def kg_query(
logger.warning("low_level_keywords and high_level_keywords is empty") logger.warning("low_level_keywords and high_level_keywords is empty")
return PROMPTS["fail_response"] return PROMPTS["fail_response"]
if ll_keywords == [] and query_param.mode in ["local", "hybrid"]: if ll_keywords == [] and query_param.mode in ["local", "hybrid"]:
logger.warning("low_level_keywords is empty") logger.warning(
return PROMPTS["fail_response"] "low_level_keywords is empty, switching from %s mode to global mode",
else: query_param.mode,
ll_keywords = ", ".join(ll_keywords) )
query_param.mode = "global"
if hl_keywords == [] and query_param.mode in ["global", "hybrid"]: if hl_keywords == [] and query_param.mode in ["global", "hybrid"]:
logger.warning("high_level_keywords is empty") logger.warning(
return PROMPTS["fail_response"] "high_level_keywords is empty, switching from %s mode to local mode",
else: query_param.mode,
hl_keywords = ", ".join(hl_keywords) )
query_param.mode = "local"
ll_keywords = ", ".join(ll_keywords) if ll_keywords else ""
hl_keywords = ", ".join(hl_keywords) if hl_keywords else ""
logger.info("Using %s mode for query processing", query_param.mode)
# Build context
keywords = [ll_keywords, hl_keywords]
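From the caller's perspective, the change above means a query whose keyword extraction comes back partially empty is downgraded to a narrower mode instead of returning the failure prompt. A sketch, assuming a configured `rag` instance; the question text is illustrative:

```python
from lightrag import QueryParam

# If no low-level keywords are extracted, kg_query now logs a warning and switches
# to global mode (or to local mode when high-level keywords are missing).
answer = rag.query(
    "How are the organizations in the corpus related?",
    param=QueryParam(mode="hybrid"),
)
print(answer)
```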
@@ -648,78 +692,52 @@ async def _build_query_context(
# ll_entities_context, ll_relations_context, ll_text_units_context = "", "", ""
# hl_entities_context, hl_relations_context, hl_text_units_context = "", "", ""
-ll_kewwords, hl_keywrds = query[0], query[1]
+ll_keywords, hl_keywords = query[0], query[1]
-if query_param.mode in ["local", "hybrid"]:
-    if ll_kewwords == "":
-        ll_entities_context, ll_relations_context, ll_text_units_context = (
-            "",
-            "",
-            "",
-        )
-        warnings.warn(
-            "Low Level context is None. Return empty Low entity/relationship/source"
-        )
-        query_param.mode = "global"
-    else:
-        (
-            ll_entities_context,
-            ll_relations_context,
-            ll_text_units_context,
-        ) = await _get_node_data(
-            ll_kewwords,
-            knowledge_graph_inst,
-            entities_vdb,
-            text_chunks_db,
-            query_param,
-        )
-if query_param.mode in ["global", "hybrid"]:
-    if hl_keywrds == "":
-        hl_entities_context, hl_relations_context, hl_text_units_context = (
-            "",
-            "",
-            "",
-        )
-        warnings.warn(
-            "High Level context is None. Return empty High entity/relationship/source"
-        )
-        query_param.mode = "local"
-    else:
-        (
-            hl_entities_context,
-            hl_relations_context,
-            hl_text_units_context,
-        ) = await _get_edge_data(
-            hl_keywrds,
-            knowledge_graph_inst,
-            relationships_vdb,
-            text_chunks_db,
-            query_param,
-        )
-        if (
-            hl_entities_context == ""
-            and hl_relations_context == ""
-            and hl_text_units_context == ""
-        ):
-            logger.warn("No high level context found. Switching to local mode.")
-            query_param.mode = "local"
-if query_param.mode == "hybrid":
-    entities_context, relations_context, text_units_context = combine_contexts(
-        [hl_entities_context, ll_entities_context],
-        [hl_relations_context, ll_relations_context],
-        [hl_text_units_context, ll_text_units_context],
-    )
-elif query_param.mode == "local":
-    entities_context, relations_context, text_units_context = (
-        ll_entities_context,
-        ll_relations_context,
-        ll_text_units_context,
-    )
-elif query_param.mode == "global":
-    entities_context, relations_context, text_units_context = (
-        hl_entities_context,
-        hl_relations_context,
-        hl_text_units_context,
-    )
+if query_param.mode == "local":
+    entities_context, relations_context, text_units_context = await _get_node_data(
+        ll_keywords,
+        knowledge_graph_inst,
+        entities_vdb,
+        text_chunks_db,
+        query_param,
+    )
+elif query_param.mode == "global":
+    entities_context, relations_context, text_units_context = await _get_edge_data(
+        hl_keywords,
+        knowledge_graph_inst,
+        relationships_vdb,
+        text_chunks_db,
+        query_param,
+    )
+else:  # hybrid mode
+    (
+        ll_entities_context,
+        ll_relations_context,
+        ll_text_units_context,
+    ) = await _get_node_data(
+        ll_keywords,
+        knowledge_graph_inst,
+        entities_vdb,
+        text_chunks_db,
+        query_param,
+    )
+    (
+        hl_entities_context,
+        hl_relations_context,
+        hl_text_units_context,
+    ) = await _get_edge_data(
+        hl_keywords,
+        knowledge_graph_inst,
+        relationships_vdb,
+        text_chunks_db,
+        query_param,
+    )
+    entities_context, relations_context, text_units_context = combine_contexts(
+        [hl_entities_context, ll_entities_context],
+        [hl_relations_context, ll_relations_context],
+        [hl_text_units_context, ll_text_units_context],
+    )
return f""" return f"""
-----Entities----- -----Entities-----
```csv ```csv

View File

@@ -1,38 +1,38 @@
accelerate
-aioboto3~=13.3.0
+aioboto3
-aiofiles~=24.1.0
+aiofiles
-aiohttp~=3.11.11
+aiohttp
-asyncpg~=0.30.0
+asyncpg
# database packages
graspologic
gremlinpython
hnswlib
nano-vectordb
-neo4j~=5.27.0
+neo4j
-networkx~=3.2.1
+networkx
-numpy~=2.2.0
+numpy
-ollama~=0.4.4
+ollama
-openai~=1.58.1
+openai
oracledb
-psycopg-pool~=3.2.4
+psycopg-pool
-psycopg[binary,pool]~=3.2.3
+psycopg[binary,pool]
-pydantic~=2.10.4
+pydantic
pymilvus
pymongo
pymysql
-python-dotenv~=1.0.1
+python-dotenv
-pyvis~=0.3.2
+pyvis
-setuptools~=70.0.0
+setuptools
# lmdeploy[all]
-sqlalchemy~=2.0.36
+sqlalchemy
-tenacity~=9.0.0
+tenacity
# LLM packages
-tiktoken~=0.8.0
+tiktoken
-torch~=2.5.1+cu121
+torch
-tqdm~=4.67.1
+tqdm
-transformers~=4.47.1
+transformers
xxhash