Merge branch 'main' into main

This commit is contained in:
zrguo
2025-01-09 15:28:57 +08:00
committed by GitHub
3 changed files with 125 additions and 78 deletions

View File

@@ -12,7 +12,7 @@
</p> </p>
<p> <p>
<img src='https://img.shields.io/github/stars/hkuds/lightrag?color=green&style=social' /> <img src='https://img.shields.io/github/stars/hkuds/lightrag?color=green&style=social' />
<img src="https://img.shields.io/badge/python->=3.10-blue"> <img src="https://img.shields.io/badge/python-3.10-blue">
<a href="https://pypi.org/project/lightrag-hku/"><img src="https://img.shields.io/pypi/v/lightrag-hku.svg"></a> <a href="https://pypi.org/project/lightrag-hku/"><img src="https://img.shields.io/pypi/v/lightrag-hku.svg"></a>
<a href="https://pepy.tech/project/lightrag-hku"><img src="https://static.pepy.tech/badge/lightrag-hku/month"></a> <a href="https://pepy.tech/project/lightrag-hku"><img src="https://static.pepy.tech/badge/lightrag-hku/month"></a>
</p> </p>

View File

@@ -323,7 +323,7 @@ class LightRAG:
) )
async def ainsert( async def ainsert(
self, string_or_strings, split_by_character, split_by_character_only self, string_or_strings, split_by_character=None, split_by_character_only=False
): ):
"""Insert documents with checkpoint support """Insert documents with checkpoint support
@@ -466,6 +466,73 @@ class LightRAG:
# Ensure all indexes are updated after each document # Ensure all indexes are updated after each document
await self._insert_done() await self._insert_done()
def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
    """Synchronous wrapper around ``ainsert_custom_chunks``.

    Grabs (or lazily creates) an event loop and blocks until the async
    insertion of the caller-pre-chunked document has finished, returning
    whatever the coroutine returns.
    """
    coro = self.ainsert_custom_chunks(full_text, text_chunks)
    return always_get_an_event_loop().run_until_complete(coro)
async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]) -> None:
    """Insert one document whose chunking was already done by the caller.

    Unlike the regular insert path, no internal splitting happens here:
    ``full_text`` is stored as a single document and each element of
    ``text_chunks`` becomes a chunk record linked back to it via
    ``full_doc_id``. Content-hash keys make the operation idempotent —
    already-present documents/chunks are skipped with a warning.

    Args:
        full_text: The complete document text (stripped before hashing).
        text_chunks: Pre-split chunk texts belonging to this document.
    """
    # Tracks whether anything was written, so `finally` only flushes
    # indexes after a real mutation.
    update_storage = False
    try:
        # Deduplicate by content hash: identical text yields the same key.
        doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-")
        new_docs = {doc_key: {"content": full_text.strip()}}

        # filter_keys returns the subset of keys NOT yet in storage.
        _add_doc_keys = await self.full_docs.filter_keys([doc_key])
        new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
        if not len(new_docs):
            logger.warning("This document is already in the storage.")
            return

        update_storage = True
        logger.info(f"[New Docs] inserting {len(new_docs)} docs")

        # Key each chunk by its own content hash; link it to the parent doc.
        inserting_chunks = {}
        for chunk_text in text_chunks:
            chunk_text_stripped = chunk_text.strip()
            chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-")

            inserting_chunks[chunk_key] = {
                "content": chunk_text_stripped,
                "full_doc_id": doc_key,
            }

        # Drop chunks that chunk storage already holds.
        _add_chunk_keys = await self.text_chunks.filter_keys(
            list(inserting_chunks.keys())
        )
        inserting_chunks = {
            k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
        }
        if not len(inserting_chunks):
            logger.warning("All chunks are already in the storage.")
            return

        logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks")

        # Push chunk embeddings into the vector store before KG extraction.
        await self.chunks_vdb.upsert(inserting_chunks)

        logger.info("[Entity Extraction]...")
        maybe_new_kg = await extract_entities(
            inserting_chunks,
            knowledge_graph_inst=self.chunk_entity_relation_graph,
            entity_vdb=self.entities_vdb,
            relationships_vdb=self.relationships_vdb,
            global_config=asdict(self),
        )

        # extract_entities returns None when nothing was extracted; in that
        # case the doc/chunk KV upserts below are skipped entirely.
        if maybe_new_kg is None:
            logger.warning("No new entities and relationships found")
            return
        else:
            self.chunk_entity_relation_graph = maybe_new_kg

        await self.full_docs.upsert(new_docs)
        await self.text_chunks.upsert(inserting_chunks)

    finally:
        # Flush/commit all storage indexes only if something was written.
        if update_storage:
            await self._insert_done()
async def _insert_done(self): async def _insert_done(self):
tasks = [] tasks = []
for storage_inst in [ for storage_inst in [

View File

@@ -4,7 +4,6 @@ import re
from tqdm.asyncio import tqdm as tqdm_async from tqdm.asyncio import tqdm as tqdm_async
from typing import Union from typing import Union
from collections import Counter, defaultdict from collections import Counter, defaultdict
import warnings
from .utils import ( from .utils import (
logger, logger,
clean_str, clean_str,
@@ -611,15 +610,22 @@ async def kg_query(
logger.warning("low_level_keywords and high_level_keywords is empty") logger.warning("low_level_keywords and high_level_keywords is empty")
return PROMPTS["fail_response"] return PROMPTS["fail_response"]
if ll_keywords == [] and query_param.mode in ["local", "hybrid"]: if ll_keywords == [] and query_param.mode in ["local", "hybrid"]:
logger.warning("low_level_keywords is empty") logger.warning(
return PROMPTS["fail_response"] "low_level_keywords is empty, switching from %s mode to global mode",
else: query_param.mode,
ll_keywords = ", ".join(ll_keywords) )
query_param.mode = "global"
if hl_keywords == [] and query_param.mode in ["global", "hybrid"]: if hl_keywords == [] and query_param.mode in ["global", "hybrid"]:
logger.warning("high_level_keywords is empty") logger.warning(
return PROMPTS["fail_response"] "high_level_keywords is empty, switching from %s mode to local mode",
else: query_param.mode,
hl_keywords = ", ".join(hl_keywords) )
query_param.mode = "local"
ll_keywords = ", ".join(ll_keywords) if ll_keywords else ""
hl_keywords = ", ".join(hl_keywords) if hl_keywords else ""
logger.info("Using %s mode for query processing", query_param.mode)
# Build context # Build context
keywords = [ll_keywords, hl_keywords] keywords = [ll_keywords, hl_keywords]
@@ -685,78 +691,52 @@ async def _build_query_context(
# ll_entities_context, ll_relations_context, ll_text_units_context = "", "", "" # ll_entities_context, ll_relations_context, ll_text_units_context = "", "", ""
# hl_entities_context, hl_relations_context, hl_text_units_context = "", "", "" # hl_entities_context, hl_relations_context, hl_text_units_context = "", "", ""
ll_kewwords, hl_keywrds = query[0], query[1] ll_keywords, hl_keywords = query[0], query[1]
if query_param.mode in ["local", "hybrid"]:
if ll_kewwords == "": if query_param.mode == "local":
ll_entities_context, ll_relations_context, ll_text_units_context = ( entities_context, relations_context, text_units_context = await _get_node_data(
"", ll_keywords,
"",
"",
)
warnings.warn(
"Low Level context is None. Return empty Low entity/relationship/source"
)
query_param.mode = "global"
else:
(
ll_entities_context,
ll_relations_context,
ll_text_units_context,
) = await _get_node_data(
ll_kewwords,
knowledge_graph_inst, knowledge_graph_inst,
entities_vdb, entities_vdb,
text_chunks_db, text_chunks_db,
query_param, query_param,
) )
if query_param.mode in ["global", "hybrid"]: elif query_param.mode == "global":
if hl_keywrds == "": entities_context, relations_context, text_units_context = await _get_edge_data(
hl_entities_context, hl_relations_context, hl_text_units_context = ( hl_keywords,
"", knowledge_graph_inst,
"", relationships_vdb,
"", text_chunks_db,
) query_param,
warnings.warn( )
"High Level context is None. Return empty High entity/relationship/source" else: # hybrid mode
) (
query_param.mode = "local" ll_entities_context,
else: ll_relations_context,
( ll_text_units_context,
hl_entities_context, ) = await _get_node_data(
hl_relations_context, ll_keywords,
hl_text_units_context, knowledge_graph_inst,
) = await _get_edge_data( entities_vdb,
hl_keywrds, text_chunks_db,
query_param,
)
(
hl_entities_context,
hl_relations_context,
hl_text_units_context,
) = await _get_edge_data(
hl_keywords,
knowledge_graph_inst, knowledge_graph_inst,
relationships_vdb, relationships_vdb,
text_chunks_db, text_chunks_db,
query_param, query_param,
) )
if (
hl_entities_context == ""
and hl_relations_context == ""
and hl_text_units_context == ""
):
logger.warn("No high level context found. Switching to local mode.")
query_param.mode = "local"
if query_param.mode == "hybrid":
entities_context, relations_context, text_units_context = combine_contexts( entities_context, relations_context, text_units_context = combine_contexts(
[hl_entities_context, ll_entities_context], [hl_entities_context, ll_entities_context],
[hl_relations_context, ll_relations_context], [hl_relations_context, ll_relations_context],
[hl_text_units_context, ll_text_units_context], [hl_text_units_context, ll_text_units_context],
) )
elif query_param.mode == "local":
entities_context, relations_context, text_units_context = (
ll_entities_context,
ll_relations_context,
ll_text_units_context,
)
elif query_param.mode == "global":
entities_context, relations_context, text_units_context = (
hl_entities_context,
hl_relations_context,
hl_text_units_context,
)
return f""" return f"""
-----Entities----- -----Entities-----
```csv ```csv