cleaned docs

This commit is contained in:
Yannick Stephan
2025-02-09 14:36:49 +01:00
parent 37943a65a3
commit abcdcd5a73
2 changed files with 31 additions and 27 deletions

View File

@@ -121,9 +121,8 @@ async def main():
texts = [x for x in all_text.split("\n") if x]
# New mode use pipeline
- await rag.apipeline_process_documents(texts)
- await rag.apipeline_process_chunks()
- await rag.apipeline_process_extract_graph()
+ await rag.apipeline_enqueue_documents(texts)
+ await rag.apipeline_process_enqueue_documents()
# Old method use ainsert
# await rag.ainsert(texts)
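This hunk switches the example from the older three-call pipeline (apipeline_process_documents, apipeline_process_chunks, apipeline_process_extract_graph) to the consolidated two-step API. A minimal sketch of how an example script might drive the new calls end to end; the LightRAG construction details below are assumptions, not part of this commit:

import asyncio

from lightrag import LightRAG


async def main() -> None:
    # Illustrative setup only; LLM and embedding configuration is omitted
    # here but is required for real extraction.
    rag = LightRAG(working_dir="./rag_storage")
    texts = ["First document.", "Second document."]
    # New two-step pipeline: enqueue the documents, then process the queue.
    await rag.apipeline_enqueue_documents(texts)
    await rag.apipeline_process_enqueue_documents()


if __name__ == "__main__":
    asyncio.run(main())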

View File

@@ -395,7 +395,9 @@ class LightRAG:
split_by_character is None, this parameter is ignored.
"""
await self.apipeline_process_documents(string_or_strings)
- await self.apipeline_process_enqueue_documents(split_by_character, split_by_character_only)
+ await self.apipeline_process_enqueue_documents(
+     split_by_character, split_by_character_only
+ )
def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
loop = always_get_an_event_loop()
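The change above is formatting only; ainsert still forwards the optional chunk-splitting arguments to apipeline_process_enqueue_documents. A hedged usage sketch, assuming rag is an already configured LightRAG instance:

async def ingest(rag, texts: list[str]) -> None:
    # Splitting arguments are optional and forwarded to the pipeline
    # (see the docstring above for their semantics).
    await rag.ainsert(texts, split_by_character="\n\n", split_by_character_only=False)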
@@ -511,23 +513,23 @@ class LightRAG:
async def _get_pending_documents(self) -> list[str]:
"""Fetch all pending and failed documents."""
to_process_doc_keys: list[str] = []
# Fetch failed documents
failed_docs = await self.doc_status.get_by_status(status=DocStatus.FAILED)
if failed_docs:
to_process_doc_keys.extend([doc["id"] for doc in failed_docs])
# Fetch pending documents
pending_docs = await self.doc_status.get_by_status(status=DocStatus.PENDING)
if pending_docs:
to_process_doc_keys.extend([doc["id"] for doc in pending_docs])
if not to_process_doc_keys:
logger.info("All documents have been processed or are duplicates")
return []
return to_process_doc_keys
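For orientation, _get_pending_documents only requires the doc-status storage to expose get_by_status() returning records that carry an "id" field, plus upsert() for writes. A minimal in-memory stand-in for that contract, illustrative only and not one of LightRAG's shipped storage backends:

class InMemoryDocStatus:
    # Toy doc-status store used only to illustrate the contract above.

    def __init__(self) -> None:
        self._docs: dict[str, dict] = {}

    async def upsert(self, records: dict[str, dict]) -> None:
        # Merge the incoming fields into the stored record, keeping its id.
        for doc_id, fields in records.items():
            self._docs.setdefault(doc_id, {"id": doc_id}).update(fields)

    async def get_by_status(self, status) -> list[dict]:
        # Return every record whose status matches (e.g. PENDING or FAILED).
        return [doc for doc in self._docs.values() if doc.get("status") == status]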
async def apipeline_process_enqueue_documents(
self,
split_by_character: str | None = None,
@@ -535,36 +537,39 @@ class LightRAG:
) -> None:
"""
Process pending documents by splitting them into chunks, processing
- each chunk for entity and relation extraction, and updating the 
+ each chunk for entity and relation extraction, and updating the
document status.
1. Get all pending and failed documents
2. Split document content into chunks
3. Process each chunk for entity and relation extraction
4. Update the document status
"""
"""
# 1. get all pending and failed documents
pending_doc_ids = await self._get_pending_documents()
if not pending_doc_ids:
logger.info("All documents have been processed or are duplicates")
return
# Get allready processed documents (text chunks and full docs)
- text_chunks_processed_doc_ids = await self.text_chunks.filter_keys(pending_doc_ids)
+ text_chunks_processed_doc_ids = await self.text_chunks.filter_keys(
+     pending_doc_ids
+ )
full_docs_processed_doc_ids = await self.full_docs.filter_keys(pending_doc_ids)
# 2. split docs into chunks, insert chunks, update doc status
batch_size = self.addon_params.get("insert_batch_size", 10)
batch_docs_list = [
- pending_doc_ids[i : i + batch_size] for i in range(0, len(pending_doc_ids), batch_size)
+ pending_doc_ids[i : i + batch_size]
+ for i in range(0, len(pending_doc_ids), batch_size)
]
# 3. iterate over batches
tasks: dict[str, list[Coroutine[Any, Any, None]]] = {}
for batch_idx, doc_ids in tqdm_async(
enumerate(batch_docs_list),
desc=f"Process Batches",
desc="Process Batches",
):
# 4. iterate over batch
for doc_id in tqdm_async(
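Worked example of the batching above: with the default insert_batch_size of 10 read from addon_params and 25 pending ids, the comprehension yields batches of 10, 10 and 5 documents.

pending_doc_ids = [f"doc-{i}" for i in range(25)]  # illustrative ids
batch_size = 10  # addon_params.get("insert_batch_size", 10)
batch_docs_list = [
    pending_doc_ids[i : i + batch_size]
    for i in range(0, len(pending_doc_ids), batch_size)
]
print([len(batch) for batch in batch_docs_list])  # -> [10, 10, 5]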
@@ -580,7 +585,7 @@ class LightRAG:
"updated_at": datetime.now().isoformat(),
"content_summary": status_doc["content_summary"],
"content_length": status_doc["content_length"],
"created_at": status_doc["created_at"],
"created_at": status_doc["created_at"],
}
}
)
@@ -599,22 +604,24 @@ class LightRAG:
self.tiktoken_model_name,
)
}
- # Ensure chunk insertion and graph processing happen sequentially, not in parallel 
+ # Ensure chunk insertion and graph processing happen sequentially, not in parallel
await self._process_entity_relation_graph(chunks)
await self.chunks_vdb.upsert(chunks)
# Check if document already processed the doc
if doc_id not in full_docs_processed_doc_ids:
tasks[doc_id].append(
- self.full_docs.upsert({doc_id: {"content": status_doc["content"]}})
+ self.full_docs.upsert(
+     {doc_id: {"content": status_doc["content"]}}
+ )
)
# Check if chunks already processed the doc
if doc_id not in text_chunks_processed_doc_ids:
tasks[doc_id].append(self.text_chunks.upsert(chunks))
- # Process document (text chunks and full docs) in parallel 
+ # Process document (text chunks and full docs) in parallel
for doc_id, task in tasks.items():
try:
await asyncio.gather(*task)
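Each document's storage coroutines are awaited together here, while the graph extraction above was awaited sequentially inside the batch loop. A simplified, self-contained illustration of that per-document gather pattern, using stand-in coroutines rather than the real storage calls:

import asyncio


async def store_full_doc(doc_id: str) -> None:
    await asyncio.sleep(0)  # stand-in for full_docs.upsert(...)


async def store_chunks(doc_id: str) -> None:
    await asyncio.sleep(0)  # stand-in for text_chunks.upsert(...)


async def flush(tasks: dict[str, list]) -> None:
    # Await every pending write for a document at once; failures would be
    # handled per document, mirroring the try/except in the diff above.
    for doc_id, coros in tasks.items():
        await asyncio.gather(*coros)


asyncio.run(flush({"doc-1": [store_full_doc("doc-1"), store_chunks("doc-1")]}))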
@@ -630,9 +637,7 @@ class LightRAG:
await self._insert_done()
except Exception as e:
- logger.error(
-     f"Failed to process document {doc_id}: {str(e)}"
- )
+ logger.error(f"Failed to process document {doc_id}: {str(e)}")
await self.doc_status.upsert(
{
doc_id: {