feat: add an INI-file based database configuration option (convenient for production); change how LightRAG's ainsert obtains `_add_doc_keys` — previously it only filtered out already-existing documents, which prevented failed documents from ever being re-stored; add `--chunk_size` and `--chunk_overlap_size` flags for production use; add `llm_binding: openai-ollama` so an OpenAI LLM can be used together with an Ollama embedding

This commit is contained in:
hyb
2025-01-23 22:58:57 +08:00
parent 3c5ced835e
commit ff71952c8c
5 changed files with 111 additions and 7 deletions

View File

@@ -361,7 +361,13 @@ class LightRAG:
}
# 3. Filter out already processed documents
_add_doc_keys = await self.doc_status.filter_keys(list(new_docs.keys()))
# _add_doc_keys = await self.doc_status.filter_keys(list(new_docs.keys()))
_add_doc_keys = {
doc_id
for doc_id in new_docs.keys()
if (current_doc := await self.doc_status.get_by_id(doc_id)) is None
or current_doc["status"] == DocStatus.FAILED
}
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
if not new_docs:
@@ -573,7 +579,7 @@ class LightRAG:
_not_stored_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys()))
if len(_not_stored_doc_keys) < len(new_docs):
logger.info(
f"Skipping {len(new_docs)-len(_not_stored_doc_keys)} already existing documents"
f"Skipping {len(new_docs) - len(_not_stored_doc_keys)} already existing documents"
)
new_docs = {k: v for k, v in new_docs.items() if k in _not_stored_doc_keys}
@@ -618,7 +624,7 @@ class LightRAG:
batch_docs = dict(list(new_docs.items())[i : i + batch_size])
for doc_id, doc in tqdm_async(
batch_docs.items(),
desc=f"Level 1 - Spliting doc in batch {i//batch_size + 1}",
desc=f"Level 1 - Spliting doc in batch {i // batch_size + 1}",
):
try:
# Generate chunks from document