Merge branch 'main' into fix-neo4j-duplicate-nodes

yangdx
2025-03-11 18:05:58 +08:00
9 changed files with 382 additions and 150 deletions

View File: README.md

@@ -176,6 +176,8 @@ class QueryParam:
"""Maximum number of tokens allocated for relationship descriptions in global retrieval."""
max_token_for_local_context: int = 4000
"""Maximum number of tokens allocated for entity descriptions in local retrieval."""
ids: list[str] | None = None # ONLY SUPPORTED FOR PG VECTOR DBs
"""List of ids to filter the RAG."""
...
```
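
The new field is only honored by the PostgreSQL vector backend in this commit (hence the `ONLY SUPPORTED FOR PG VECTOR DBs` note). A minimal usage sketch, assuming an already-configured `LightRAG` instance named `rag` and placeholder document ids:

```python
from lightrag import QueryParam

# Placeholder ids; `rag` is assumed to be a LightRAG instance backed by
# PGVectorStorage. Other vector backends ignore the filter for now.
param = QueryParam(mode="hybrid", ids=["doc-123", "doc-456"])
answer = rag.query("Summarize the termination clauses.", param=param)
```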

View File: lightrag/__init__.py

@@ -1,5 +1,5 @@
from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
__version__ = "1.2.4"
__version__ = "1.2.5"
__author__ = "Zirui Guo"
__url__ = "https://github.com/HKUDS/LightRAG"

View File: lightrag/base.py

@@ -81,6 +81,9 @@ class QueryParam:
history_turns: int = 3
"""Number of complete conversation turns (user-assistant pairs) to consider in the response context."""
ids: list[str] | None = None
"""List of ids to filter the results."""
@dataclass
class StorageNameSpace(ABC):
@@ -107,7 +110,9 @@ class BaseVectorStorage(StorageNameSpace, ABC):
meta_fields: set[str] = field(default_factory=set)
@abstractmethod
async def query(self, query: str, top_k: int) -> list[dict[str, Any]]:
async def query(
self, query: str, top_k: int, ids: list[str] | None = None
) -> list[dict[str, Any]]:
"""Query the vector storage and retrieve top_k results."""
@abstractmethod
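
Every concrete backend must now accept the optional `ids` argument, even if it ignores it. A sketch of a conforming subclass (hypothetical class; the other abstract members are omitted):

```python
from typing import Any

from lightrag.base import BaseVectorStorage

class InMemoryVectorStorage(BaseVectorStorage):
    async def query(
        self, query: str, top_k: int, ids: list[str] | None = None
    ) -> list[dict[str, Any]]:
        # Backends without per-document filtering may simply ignore `ids`,
        # as the non-Postgres implementations do after this change.
        return []
```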

View File: lightrag/kg/postgres_impl.py

@@ -438,6 +438,8 @@ class PGVectorStorage(BaseVectorStorage):
"entity_name": item["entity_name"],
"content": item["content"],
"content_vector": json.dumps(item["__vector__"].tolist()),
"chunk_id": item["source_id"],
# TODO: add document_id
}
return upsert_sql, data
@@ -450,6 +452,8 @@ class PGVectorStorage(BaseVectorStorage):
"target_id": item["tgt_id"],
"content": item["content"],
"content_vector": json.dumps(item["__vector__"].tolist()),
"chunk_id": item["source_id"],
# TODO: add document_id
}
return upsert_sql, data
@@ -492,13 +496,20 @@ class PGVectorStorage(BaseVectorStorage):
await self.db.execute(upsert_sql, data)
#################### query method ###############
async def query(self, query: str, top_k: int) -> list[dict[str, Any]]:
async def query(
self, query: str, top_k: int, ids: list[str] | None = None
) -> list[dict[str, Any]]:
embeddings = await self.embedding_func([query])
embedding = embeddings[0]
embedding_string = ",".join(map(str, embedding))
if ids:
formatted_ids = ",".join(f"'{id}'" for id in ids)
else:
formatted_ids = "NULL"
sql = SQL_TEMPLATES[self.base_namespace].format(
embedding_string=embedding_string
embedding_string=embedding_string, doc_ids=formatted_ids
)
params = {
"workspace": self.db.workspace,
@@ -1491,6 +1502,7 @@ TABLES = {
content_vector VECTOR,
create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
update_time TIMESTAMP,
chunk_id VARCHAR(255) NULL,
CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id)
)"""
},
@@ -1504,6 +1516,7 @@ TABLES = {
content_vector VECTOR,
create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
update_time TIMESTAMP,
chunk_id VARCHAR(255) NULL,
CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id)
)"""
},
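
`CREATE TABLE` only takes effect on fresh databases, so existing deployments need the new column added by hand. A hedged migration sketch (the commit itself ships no migration; `ADD COLUMN IF NOT EXISTS` requires PostgreSQL 9.6+):

```python
# Run once against an existing database before upgrading:
MIGRATION_SQL = """
ALTER TABLE LIGHTRAG_VDB_ENTITY   ADD COLUMN IF NOT EXISTS chunk_id VARCHAR(255) NULL;
ALTER TABLE LIGHTRAG_VDB_RELATION ADD COLUMN IF NOT EXISTS chunk_id VARCHAR(255) NULL;
"""
```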
@@ -1586,8 +1599,9 @@ SQL_TEMPLATES = {
content_vector=EXCLUDED.content_vector,
update_time = CURRENT_TIMESTAMP
""",
"upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, content_vector)
VALUES ($1, $2, $3, $4, $5)
"upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
content_vector, chunk_id)
VALUES ($1, $2, $3, $4, $5, $6)
ON CONFLICT (workspace,id) DO UPDATE
SET entity_name=EXCLUDED.entity_name,
content=EXCLUDED.content,
@@ -1595,8 +1609,8 @@ SQL_TEMPLATES = {
update_time=CURRENT_TIMESTAMP
""",
"upsert_relationship": """INSERT INTO LIGHTRAG_VDB_RELATION (workspace, id, source_id,
target_id, content, content_vector)
VALUES ($1, $2, $3, $4, $5, $6)
target_id, content, content_vector, chunk_id)
VALUES ($1, $2, $3, $4, $5, $6, $7)
ON CONFLICT (workspace,id) DO UPDATE
SET source_id=EXCLUDED.source_id,
target_id=EXCLUDED.target_id,
@@ -1604,21 +1618,21 @@ SQL_TEMPLATES = {
content_vector=EXCLUDED.content_vector, update_time = CURRENT_TIMESTAMP
""",
# SQL for VectorStorage
"entities": """SELECT entity_name FROM
(SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_VDB_ENTITY where workspace=$1)
WHERE distance>$2 ORDER BY distance DESC LIMIT $3
""",
"relationships": """SELECT source_id as src_id, target_id as tgt_id FROM
(SELECT id, source_id,target_id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_VDB_RELATION where workspace=$1)
WHERE distance>$2 ORDER BY distance DESC LIMIT $3
""",
"chunks": """SELECT id FROM
(SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_DOC_CHUNKS where workspace=$1)
WHERE distance>$2 ORDER BY distance DESC LIMIT $3
""",
# "entities": """SELECT entity_name FROM
# (SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
# FROM LIGHTRAG_VDB_ENTITY where workspace=$1)
# WHERE distance>$2 ORDER BY distance DESC LIMIT $3
# """,
# "relationships": """SELECT source_id as src_id, target_id as tgt_id FROM
# (SELECT id, source_id,target_id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
# FROM LIGHTRAG_VDB_RELATION where workspace=$1)
# WHERE distance>$2 ORDER BY distance DESC LIMIT $3
# """,
# "chunks": """SELECT id FROM
# (SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
# FROM LIGHTRAG_DOC_CHUNKS where workspace=$1)
# WHERE distance>$2 ORDER BY distance DESC LIMIT $3
# """,
# DROP tables
"drop_all": """
DROP TABLE IF EXISTS LIGHTRAG_DOC_FULL CASCADE;
@@ -1642,4 +1656,55 @@ SQL_TEMPLATES = {
"drop_vdb_relation": """
DROP TABLE IF EXISTS LIGHTRAG_VDB_RELATION CASCADE;
""",
"relationships": """
WITH relevant_chunks AS (
SELECT id as chunk_id
FROM LIGHTRAG_DOC_CHUNKS
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
)
SELECT source_id as src_id, target_id as tgt_id
FROM (
SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_VDB_RELATION r
WHERE r.workspace=$1
AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks)
) filtered
WHERE distance>$2
ORDER BY distance DESC
LIMIT $3
""",
"entities": """
WITH relevant_chunks AS (
SELECT id as chunk_id
FROM LIGHTRAG_DOC_CHUNKS
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
)
SELECT entity_name FROM
(
SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_VDB_ENTITY
where workspace=$1
AND chunk_id IN (SELECT chunk_id FROM relevant_chunks)
)
WHERE distance>$2
ORDER BY distance DESC
LIMIT $3
""",
"chunks": """
WITH relevant_chunks AS (
SELECT id as chunk_id
FROM LIGHTRAG_DOC_CHUNKS
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
)
SELECT id FROM
(
SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
FROM LIGHTRAG_DOC_CHUNKS
where workspace=$1
AND id IN (SELECT chunk_id FROM relevant_chunks)
)
WHERE distance>$2
ORDER BY distance DESC
LIMIT $3
""",
}
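
To make the `{doc_ids}` substitution concrete, here is an illustration mirroring the formatting step in `PGVectorStorage.query` above (illustrative values; import path assumed):

```python
from lightrag.kg.postgres_impl import SQL_TEMPLATES

ids = ["doc-1"]  # placeholder document id
formatted_ids = ",".join(f"'{id}'" for id in ids) if ids else "NULL"
sql = SQL_TEMPLATES["chunks"].format(
    embedding_string="0.1,0.2,0.3",  # stand-in for a real query embedding
    doc_ids=formatted_ids,
)
# Rendered filter: WHERE 'doc-1' IS NULL OR full_doc_id = ANY(ARRAY['doc-1'])
# With no ids it becomes WHERE NULL IS NULL OR ..., i.e. no filtering at all.
# $1 (workspace), $2 (cosine threshold) and $3 (top_k) stay bound parameters.
```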

View File: lightrag/lightrag.py

@@ -30,11 +30,10 @@ from .namespace import NameSpace, make_namespace
from .operate import (
chunking_by_token_size,
extract_entities,
extract_keywords_only,
kg_query,
kg_query_with_keywords,
mix_kg_vector_query,
naive_query,
query_with_keywords,
)
from .prompt import GRAPH_FIELD_SEP, PROMPTS
from .utils import (
@@ -45,6 +44,9 @@ from .utils import (
encode_string_by_tiktoken,
lazy_external_import,
limit_async_func_call,
get_content_summary,
clean_text,
check_storage_env_vars,
logger,
)
from .types import KnowledgeGraph
@@ -309,7 +311,7 @@ class LightRAG:
# Verify storage implementation compatibility
verify_storage_implementation(storage_type, storage_name)
# Check environment variables
# self.check_storage_env_vars(storage_name)
check_storage_env_vars(storage_name)
# Ensure vector_db_storage_cls_kwargs has required fields
self.vector_db_storage_cls_kwargs = {
@@ -536,11 +538,6 @@ class LightRAG:
storage_class = lazy_external_import(import_path, storage_name)
return storage_class
@staticmethod
def clean_text(text: str) -> str:
"""Clean text by removing null bytes (0x00) and whitespace"""
return text.strip().replace("\x00", "")
def insert(
self,
input: str | list[str],
@@ -602,8 +599,8 @@ class LightRAG:
update_storage = False
try:
# Clean input texts
full_text = self.clean_text(full_text)
text_chunks = [self.clean_text(chunk) for chunk in text_chunks]
full_text = clean_text(full_text)
text_chunks = [clean_text(chunk) for chunk in text_chunks]
# Process cleaned texts
if doc_id is None:
@@ -682,7 +679,7 @@ class LightRAG:
contents = {id_: doc for id_, doc in zip(ids, input)}
else:
# Clean input text and remove duplicates
input = list(set(self.clean_text(doc) for doc in input))
input = list(set(clean_text(doc) for doc in input))
# Generate contents dict of MD5 hash IDs and documents
contents = {compute_mdhash_id(doc, prefix="doc-"): doc for doc in input}
@@ -698,7 +695,7 @@ class LightRAG:
new_docs: dict[str, Any] = {
id_: {
"content": content,
"content_summary": self._get_content_summary(content),
"content_summary": get_content_summary(content),
"content_length": len(content),
"status": DocStatus.PENDING,
"created_at": datetime.now().isoformat(),
@@ -1063,7 +1060,7 @@ class LightRAG:
all_chunks_data: dict[str, dict[str, str]] = {}
chunk_to_source_map: dict[str, str] = {}
for chunk_data in custom_kg.get("chunks", []):
chunk_content = self.clean_text(chunk_data["content"])
chunk_content = clean_text(chunk_data["content"])
source_id = chunk_data["source_id"]
tokens = len(
encode_string_by_tiktoken(
@@ -1296,8 +1293,17 @@ class LightRAG:
self, query: str, prompt: str, param: QueryParam = QueryParam()
):
"""
1. Extract keywords from the 'query' using new function in operate.py.
2. Then run the standard aquery() flow with the final prompt (formatted_question).
Query with separate keyword extraction step.
This method extracts keywords from the query first, then uses them for the query.
Args:
query: User query
prompt: Additional prompt for the query
param: Query parameters
Returns:
Query response
"""
loop = always_get_an_event_loop()
return loop.run_until_complete(
@@ -1308,66 +1314,29 @@ class LightRAG:
self, query: str, prompt: str, param: QueryParam = QueryParam()
) -> str | AsyncIterator[str]:
"""
1. Calls extract_keywords_only to get HL/LL keywords from 'query'.
2. Then calls kg_query(...) or naive_query(...), etc. as the main query, while also injecting the newly extracted keywords if needed.
Async version of query_with_separate_keyword_extraction.
Args:
query: User query
prompt: Additional prompt for the query
param: Query parameters
Returns:
Query response or async iterator
"""
# ---------------------
# STEP 1: Keyword Extraction
# ---------------------
hl_keywords, ll_keywords = await extract_keywords_only(
text=query,
response = await query_with_keywords(
query=query,
prompt=prompt,
param=param,
knowledge_graph_inst=self.chunk_entity_relation_graph,
entities_vdb=self.entities_vdb,
relationships_vdb=self.relationships_vdb,
chunks_vdb=self.chunks_vdb,
text_chunks_db=self.text_chunks,
global_config=asdict(self),
hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
hashing_kv=self.llm_response_cache,
)
param.hl_keywords = hl_keywords
param.ll_keywords = ll_keywords
# ---------------------
# STEP 2: Final Query Logic
# ---------------------
# Create a new string with the prompt and the keywords
ll_keywords_str = ", ".join(ll_keywords)
hl_keywords_str = ", ".join(hl_keywords)
formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}"
if param.mode in ["local", "global", "hybrid"]:
response = await kg_query_with_keywords(
formatted_question,
self.chunk_entity_relation_graph,
self.entities_vdb,
self.relationships_vdb,
self.text_chunks,
param,
asdict(self),
hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
)
elif param.mode == "naive":
response = await naive_query(
formatted_question,
self.chunks_vdb,
self.text_chunks,
param,
asdict(self),
hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
)
elif param.mode == "mix":
response = await mix_kg_vector_query(
formatted_question,
self.chunk_entity_relation_graph,
self.entities_vdb,
self.relationships_vdb,
self.chunks_vdb,
self.text_chunks,
param,
asdict(self),
hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
)
else:
raise ValueError(f"Unknown mode {param.mode}")
await self._query_done()
return response
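
The public entry points are unchanged by this refactor; a usage sketch of the synchronous wrapper, assuming a configured `rag` instance:

```python
from lightrag import QueryParam

# Same call site as before; the keyword-extraction plumbing now lives in
# operate.query_with_keywords instead of being inlined here.
answer = rag.query_with_separate_keyword_extraction(
    query="Which duplicate entities were merged?",
    prompt="Answer in one paragraph.",
    param=QueryParam(mode="hybrid"),
)
```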
@@ -1465,21 +1434,6 @@ class LightRAG:
]
)
def _get_content_summary(self, content: str, max_length: int = 100) -> str:
"""Get summary of document content
Args:
content: Original document content
max_length: Maximum length of summary
Returns:
Truncated content with ellipsis if needed
"""
content = content.strip()
if len(content) <= max_length:
return content
return content[:max_length] + "..."
async def get_processing_status(self) -> dict[str, int]:
"""Get current document processing status counts
@@ -2622,6 +2576,12 @@ class LightRAG:
# 9. Delete source entities
for entity_name in source_entities:
if entity_name == target_entity:
logger.info(
f"Skipping deletion of '{entity_name}' as it's also the target entity"
)
continue
# Delete entity node from knowledge graph
await self.chunk_entity_relation_graph.delete_node(entity_name)

View File: lightrag/llm/azure_openai.py

@@ -55,6 +55,7 @@ async def azure_openai_complete_if_cache(
openai_async_client = AsyncAzureOpenAI(
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
azure_deployment=model,
api_key=os.getenv("AZURE_OPENAI_API_KEY"),
api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)
@@ -136,6 +137,7 @@ async def azure_openai_embed(
openai_async_client = AsyncAzureOpenAI(
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
azure_deployment=model,
api_key=os.getenv("AZURE_OPENAI_API_KEY"),
api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)
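
The key is now passed explicitly instead of relying on the SDK's implicit environment lookup; either way, these variables must be present. A sketch with placeholder values:

```python
import os

# Placeholder values; set these in your real environment or .env file.
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://<resource>.openai.azure.com/"
os.environ["AZURE_OPENAI_API_KEY"] = "<your-key>"
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-02-15-preview"
```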

View File: lightrag/operate.py

@@ -141,18 +141,36 @@ async def _handle_single_entity_extraction(
):
if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
return None
# add this record as a node in the G
# Clean and validate entity name
entity_name = clean_str(record_attributes[1]).strip('"')
if not entity_name.strip():
logger.warning(
f"Entity extraction error: empty entity name in: {record_attributes}"
)
return None
# Clean and validate entity type
entity_type = clean_str(record_attributes[2]).strip('"')
if not entity_type.strip() or entity_type.startswith('("'):
logger.warning(
f"Entity extraction error: invalid entity type in: {record_attributes}"
)
return None
# Clean and validate description
entity_description = clean_str(record_attributes[3]).strip('"')
entity_source_id = chunk_key
if not entity_description.strip():
logger.warning(
f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
)
return None
return dict(
entity_name=entity_name,
entity_type=entity_type,
description=entity_description,
source_id=entity_source_id,
source_id=chunk_key,
metadata={"created_at": time.time()},
)
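
For reference, a record that passes all three of the new checks looks like this (illustrative values only):

```python
# One tuple-delimited record as emitted by the extraction prompt; each of the
# four fields must be non-empty after cleaning for the record to be kept.
record_attributes = [
    '"entity"',       # record-type marker, checked before anything else
    '"Alan Turing"',  # entity_name: must not be empty
    '"person"',       # entity_type: must not be empty or start with '("
    '"British mathematician and pioneer of computer science."',  # description
]
```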
@@ -438,47 +456,22 @@ async def extract_entities(
else:
return await use_llm_func(input_text)
async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
""" "Prpocess a single chunk
async def _process_extraction_result(result: str, chunk_key: str):
"""Process a single extraction result (either initial or gleaning)
Args:
chunk_key_dp (tuple[str, TextChunkSchema]):
("chunck-xxxxxx", {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int})
result (str): The extraction result to process
chunk_key (str): The chunk key for source tracking
Returns:
tuple: (nodes_dict, edges_dict) containing the extracted entities and relationships
"""
nonlocal processed_chunks
chunk_key = chunk_key_dp[0]
chunk_dp = chunk_key_dp[1]
content = chunk_dp["content"]
# hint_prompt = entity_extract_prompt.format(**context_base, input_text=content)
hint_prompt = entity_extract_prompt.format(
**context_base, input_text="{input_text}"
).format(**context_base, input_text=content)
final_result = await _user_llm_func_with_cache(hint_prompt)
history = pack_user_ass_to_openai_messages(hint_prompt, final_result)
for now_glean_index in range(entity_extract_max_gleaning):
glean_result = await _user_llm_func_with_cache(
continue_prompt, history_messages=history
)
history += pack_user_ass_to_openai_messages(continue_prompt, glean_result)
final_result += glean_result
if now_glean_index == entity_extract_max_gleaning - 1:
break
if_loop_result: str = await _user_llm_func_with_cache(
if_loop_prompt, history_messages=history
)
if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
if if_loop_result != "yes":
break
maybe_nodes = defaultdict(list)
maybe_edges = defaultdict(list)
records = split_string_by_multi_markers(
final_result,
result,
[context_base["record_delimiter"], context_base["completion_delimiter"]],
)
maybe_nodes = defaultdict(list)
maybe_edges = defaultdict(list)
for record in records:
record = re.search(r"\((.*)\)", record)
if record is None:
@@ -487,6 +480,7 @@ async def extract_entities(
record_attributes = split_string_by_multi_markers(
record, [context_base["tuple_delimiter"]]
)
if_entities = await _handle_single_entity_extraction(
record_attributes, chunk_key
)
@@ -501,6 +495,62 @@ async def extract_entities(
maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append(
if_relation
)
return maybe_nodes, maybe_edges
async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
"""Process a single chunk
Args:
chunk_key_dp (tuple[str, TextChunkSchema]):
("chunk-xxxxxx", {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int})
"""
nonlocal processed_chunks
chunk_key = chunk_key_dp[0]
chunk_dp = chunk_key_dp[1]
content = chunk_dp["content"]
# Get initial extraction
hint_prompt = entity_extract_prompt.format(
**context_base, input_text="{input_text}"
).format(**context_base, input_text=content)
final_result = await _user_llm_func_with_cache(hint_prompt)
history = pack_user_ass_to_openai_messages(hint_prompt, final_result)
# Process initial extraction
maybe_nodes, maybe_edges = await _process_extraction_result(
final_result, chunk_key
)
# Process additional gleaning results
for now_glean_index in range(entity_extract_max_gleaning):
glean_result = await _user_llm_func_with_cache(
continue_prompt, history_messages=history
)
history += pack_user_ass_to_openai_messages(continue_prompt, glean_result)
# Process gleaning result separately
glean_nodes, glean_edges = await _process_extraction_result(
glean_result, chunk_key
)
# Merge results
for entity_name, entities in glean_nodes.items():
maybe_nodes[entity_name].extend(entities)
for edge_key, edges in glean_edges.items():
maybe_edges[edge_key].extend(edges)
if now_glean_index == entity_extract_max_gleaning - 1:
break
if_loop_result: str = await _user_llm_func_with_cache(
if_loop_prompt, history_messages=history
)
if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
if if_loop_result != "yes":
break
processed_chunks += 1
entities_count = len(maybe_nodes)
relations_count = len(maybe_edges)
@@ -912,7 +962,10 @@ async def mix_kg_vector_query(
try:
# Reduce top_k for vector search in hybrid mode since we have structured information from KG
mix_topk = min(10, query_param.top_k)
results = await chunks_vdb.query(augmented_query, top_k=mix_topk)
# TODO: add ids to the query
results = await chunks_vdb.query(
augmented_query, top_k=mix_topk, ids=query_param.ids
)
if not results:
return None
@@ -1121,7 +1174,11 @@ async def _get_node_data(
logger.info(
f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}"
)
results = await entities_vdb.query(query, top_k=query_param.top_k)
results = await entities_vdb.query(
query, top_k=query_param.top_k, ids=query_param.ids
)
if not len(results):
return "", "", ""
# get entity information
@@ -1374,7 +1431,10 @@ async def _get_edge_data(
logger.info(
f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
)
results = await relationships_vdb.query(keywords, top_k=query_param.top_k)
results = await relationships_vdb.query(
keywords, top_k=query_param.top_k, ids=query_param.ids
)
if not len(results):
return "", "", ""
@@ -1623,7 +1683,9 @@ async def naive_query(
if cached_response is not None:
return cached_response
results = await chunks_vdb.query(query, top_k=query_param.top_k)
results = await chunks_vdb.query(
query, top_k=query_param.top_k, ids=query_param.ids
)
if not len(results):
return PROMPTS["fail_response"]
@@ -1854,3 +1916,90 @@ async def kg_query_with_keywords(
)
return response
async def query_with_keywords(
query: str,
prompt: str,
param: QueryParam,
knowledge_graph_inst: BaseGraphStorage,
entities_vdb: BaseVectorStorage,
relationships_vdb: BaseVectorStorage,
chunks_vdb: BaseVectorStorage,
text_chunks_db: BaseKVStorage,
global_config: dict[str, str],
hashing_kv: BaseKVStorage | None = None,
) -> str | AsyncIterator[str]:
"""
Extract keywords from the query and then use them for retrieving information.
1. Extracts high-level and low-level keywords from the query
2. Formats the query with the extracted keywords and prompt
3. Uses the appropriate query method based on param.mode
Args:
query: The user's query
prompt: Additional prompt to prepend to the query
param: Query parameters
knowledge_graph_inst: Knowledge graph storage
entities_vdb: Entities vector database
relationships_vdb: Relationships vector database
chunks_vdb: Document chunks vector database
text_chunks_db: Text chunks storage
global_config: Global configuration
hashing_kv: Cache storage
Returns:
Query response or async iterator
"""
# Extract keywords
hl_keywords, ll_keywords = await extract_keywords_only(
text=query,
param=param,
global_config=global_config,
hashing_kv=hashing_kv,
)
param.hl_keywords = hl_keywords
param.ll_keywords = ll_keywords
# Create a new string with the prompt and the keywords
ll_keywords_str = ", ".join(ll_keywords)
hl_keywords_str = ", ".join(hl_keywords)
formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}"
# Use appropriate query method based on mode
if param.mode in ["local", "global", "hybrid"]:
return await kg_query_with_keywords(
formatted_question,
knowledge_graph_inst,
entities_vdb,
relationships_vdb,
text_chunks_db,
param,
global_config,
hashing_kv=hashing_kv,
)
elif param.mode == "naive":
return await naive_query(
formatted_question,
chunks_vdb,
text_chunks_db,
param,
global_config,
hashing_kv=hashing_kv,
)
elif param.mode == "mix":
return await mix_kg_vector_query(
formatted_question,
knowledge_graph_inst,
entities_vdb,
relationships_vdb,
chunks_vdb,
text_chunks_db,
param,
global_config,
hashing_kv=hashing_kv,
)
else:
raise ValueError(f"Unknown mode {param.mode}")

View File: lightrag/prompt.py

@@ -236,7 +236,7 @@ Given the query and conversation history, list both high-level and low-level key
---Instructions---
- Consider both the current query and relevant conversation history when extracting keywords
- Output the keywords in JSON format
- Output the keywords in JSON format, it will be parsed by a JSON parser, do not add any extra content in output
- The JSON should have two keys:
- "high_level_keywords" for overarching concepts or themes
- "low_level_keywords" for specific entities or details

View File: lightrag/utils.py

@@ -890,3 +890,52 @@ def lazy_external_import(module_name: str, class_name: str) -> Callable[..., Any
return cls(*args, **kwargs)
return import_class
def get_content_summary(content: str, max_length: int = 100) -> str:
"""Get summary of document content
Args:
content: Original document content
max_length: Maximum length of summary
Returns:
Truncated content with ellipsis if needed
"""
content = content.strip()
if len(content) <= max_length:
return content
return content[:max_length] + "..."
def clean_text(text: str) -> str:
"""Clean text by removing null bytes (0x00) and whitespace
Args:
text: Input text to clean
Returns:
Cleaned text
"""
return text.strip().replace("\x00", "")
def check_storage_env_vars(storage_name: str) -> None:
"""Check if all required environment variables for storage implementation exist
Args:
storage_name: Storage implementation name
Raises:
ValueError: If required environment variables are missing
"""
from lightrag.kg import STORAGE_ENV_REQUIREMENTS
required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, [])
missing_vars = [var for var in required_vars if var not in os.environ]
if missing_vars:
raise ValueError(
f"Storage implementation '{storage_name}' requires the following "
f"environment variables: {', '.join(missing_vars)}"
)
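
A quick smoke test of the three relocated helpers (the storage name below is an example key; consult `STORAGE_ENV_REQUIREMENTS` in `lightrag/kg` for the real ones):

```python
from lightrag.utils import (
    check_storage_env_vars,
    clean_text,
    get_content_summary,
)

print(clean_text("  null\x00byte  "))     # -> "nullbyte"
print(get_content_summary("word " * 50))  # -> first 100 chars plus "..."

# Raises ValueError when the backend's required variables are missing
# (example storage name, assumed to be a valid key):
check_storage_env_vars("PGKVStorage")
```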