updated the pipe

This commit is contained in:
Yannick Stephan
2025-02-09 13:54:04 +01:00
parent acbe3e2ff2
commit 1c7d14ef76

View File

@@ -4,7 +4,7 @@ from tqdm.asyncio import tqdm as tqdm_async
from dataclasses import asdict, dataclass, field
from datetime import datetime
from functools import partial
from typing import Any, Callable, Optional, Type, Union, cast
from typing import Any, Callable, Coroutine, Optional, Type, Union, cast
import traceback
from .operate import (
chunking_by_token_size,
@@ -561,18 +561,26 @@ class LightRAG:
]
for i, el in enumerate(batch_docs_list):
items = ((k, v) for d in el for k, v in d.items())
tasks: dict[str, list[Coroutine[Any, Any, None]]] = {}
doc_status: dict[str, Any] = {
"status": DocStatus.PROCESSING,
"updated_at": datetime.now().isoformat(),
}
for doc_id, doc in tqdm_async(
items,
desc=f"Level 1 - Spliting doc in batch {i // len(batch_docs_list) + 1}",
):
doc_status: dict[str, Any] = {
doc_status.update(
{
"content_summary": doc["content_summary"],
"content_length": doc["content_length"],
"status": DocStatus.PROCESSING,
"created_at": doc["created_at"],
"updated_at": datetime.now().isoformat(),
}
try:
)
await self.doc_status.upsert({doc_id: doc_status})
# Generate chunks from document
@@ -590,19 +598,35 @@ class LightRAG:
self.tiktoken_model_name,
)
}
await self.chunks_vdb.upsert(chunks)
# Update status with chunks information
try:
# If it fails, it has failed on the full doc and text chunks upsert
if doc["status"] != DocStatus.FAILED:
# Ensure chunk insertion and graph processing happen sequentially
await self._process_entity_relation_graph(chunks)
await self.chunks_vdb.upsert(chunks)
except Exception as e:
doc_status.update(
{
"status": DocStatus.PENDING,
"error": str(e),
"updated_at": datetime.now().isoformat(),
}
)
await self.doc_status.upsert({doc_id: doc_status})
if doc_id not in full_docs_new_docs_ids:
await self.full_docs.upsert(
{doc_id: {"content": doc["content"]}}
tasks[doc_id].append(
self.full_docs.upsert({doc_id: {"content": doc["content"]}})
)
if doc_id not in text_chunks_new_docs_ids:
await self.text_chunks.upsert(chunks)
tasks[doc_id].append(self.text_chunks.upsert(chunks))
for doc_id, task in tasks.items():
try:
await asyncio.gather(*task)
# Update document status
doc_status.update(
{
"status": DocStatus.PROCESSED,