updated the pipe

Yannick Stephan
2025-02-09 13:54:04 +01:00
parent acbe3e2ff2
commit 1c7d14ef76


@@ -4,7 +4,7 @@ from tqdm.asyncio import tqdm as tqdm_async
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from functools import partial
-from typing import Any, Callable, Optional, Type, Union, cast
+from typing import Any, Callable, Coroutine, Optional, Type, Union, cast
 import traceback
 from .operate import (
     chunking_by_token_size,
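
The only functional change in this hunk is the new `Coroutine` import, which the later hunks use to type the map of deferred storage writes. A minimal sketch of that typing pattern, assuming a hypothetical `fake_upsert` in place of the real storage calls:

```python
import asyncio
from typing import Any, Coroutine


async def fake_upsert(doc_id: str) -> None:
    # Hypothetical stand-in for a storage upsert call.
    await asyncio.sleep(0)


async def main() -> None:
    # The commit annotates the task map with Coroutine, hence the new import.
    tasks: dict[str, list[Coroutine[Any, Any, None]]] = {}
    tasks.setdefault("doc-1", []).append(fake_upsert("doc-1"))
    for doc_id, pending in tasks.items():
        await asyncio.gather(*pending)  # flush all deferred writes for this doc
        print(f"{doc_id}: flushed")


asyncio.run(main())
```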
@@ -561,18 +561,26 @@ class LightRAG:
             ]
             for i, el in enumerate(batch_docs_list):
                 items = ((k, v) for d in el for k, v in d.items())
+                tasks: dict[str, list[Coroutine[Any, Any, None]]] = {}
+                doc_status: dict[str, Any] = {
+                    "status": DocStatus.PROCESSING,
+                    "updated_at": datetime.now().isoformat(),
+                }
                 for doc_id, doc in tqdm_async(
                     items,
                     desc=f"Level 1 - Splitting doc in batch {i // len(batch_docs_list) + 1}",
                 ):
-                    doc_status: dict[str, Any] = {
-                        "content_summary": doc["content_summary"],
-                        "content_length": doc["content_length"],
-                        "status": DocStatus.PROCESSING,
-                        "created_at": doc["created_at"],
-                        "updated_at": datetime.now().isoformat(),
-                    }
-                    try:
-                        await self.doc_status.upsert({doc_id: doc_status})
+                    doc_status.update(
+                        {
+                            "content_summary": doc["content_summary"],
+                            "content_length": doc["content_length"],
+                            "created_at": doc["created_at"],
+                        }
+                    )
+                    await self.doc_status.upsert({doc_id: doc_status})
                     # Generate chunks from document
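
This hunk hoists `doc_status` out of the per-document loop: the fields shared by every document (`status`, `updated_at`) are set once, and each document merges its own fields in with `dict.update()`. A standalone sketch of the pattern, with a hypothetical `DocStatus` enum standing in for LightRAG's:

```python
from datetime import datetime
from enum import Enum
from typing import Any


class DocStatus(str, Enum):
    # Hypothetical stand-in for LightRAG's DocStatus enum.
    PROCESSING = "processing"
    PROCESSED = "processed"


# Shared fields are written once, before the per-document loop.
doc_status: dict[str, Any] = {
    "status": DocStatus.PROCESSING,
    "updated_at": datetime.now().isoformat(),
}

docs = [{"content_summary": "intro", "content_length": 120, "created_at": "2025-02-09"}]
for doc in docs:
    # Per-document fields are merged into the same dict on every iteration.
    doc_status.update(
        {
            "content_summary": doc["content_summary"],
            "content_length": doc["content_length"],
            "created_at": doc["created_at"],
        }
    )
    print(doc_status)
```

One consequence of reusing a single dict: keys written for one document (such as the `error` field set in the next hunk) persist into later iterations unless they are overwritten.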
@@ -590,19 +598,35 @@ class LightRAG:
                             self.tiktoken_model_name,
                         )
                     }
-                    await self.chunks_vdb.upsert(chunks)
-                    # Update status with chunks information
+                    try:
+                        # If the doc already failed, it failed on the full-doc and text-chunks upsert
+                        if doc["status"] != DocStatus.FAILED:
+                            # Ensure chunk insertion and graph processing happen sequentially
                             await self._process_entity_relation_graph(chunks)
+                            await self.chunks_vdb.upsert(chunks)
+                    except Exception as e:
+                        doc_status.update(
+                            {
+                                "status": DocStatus.PENDING,
+                                "error": str(e),
+                                "updated_at": datetime.now().isoformat(),
+                            }
+                        )
+                        await self.doc_status.upsert({doc_id: doc_status})
                     if doc_id not in full_docs_new_docs_ids:
-                        await self.full_docs.upsert(
-                            {doc_id: {"content": doc["content"]}}
+                        tasks[doc_id].append(
+                            self.full_docs.upsert({doc_id: {"content": doc["content"]}})
                         )
                     if doc_id not in text_chunks_new_docs_ids:
-                        await self.text_chunks.upsert(chunks)
+                        tasks[doc_id].append(self.text_chunks.upsert(chunks))
+                for doc_id, task in tasks.items():
+                    try:
+                        await asyncio.gather(*task)
+                        # Update document status
                         doc_status.update(
                             {
                                 "status": DocStatus.PROCESSED,