cleaned docs

Yannick Stephan
2025-02-09 14:36:49 +01:00
parent 37943a65a3
commit abcdcd5a73
2 changed files with 31 additions and 27 deletions

@@ -121,9 +121,8 @@ async def main():
         texts = [x for x in all_text.split("\n") if x]
         # New mode use pipeline
-        await rag.apipeline_process_documents(texts)
-        await rag.apipeline_process_chunks()
-        await rag.apipeline_process_extract_graph()
+        await rag.apipeline_enqueue_documents(texts)
+        await rag.apipeline_process_enqueue_documents()
         # Old method use ainsert
         # await rag.ainsert(texts)
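
For reference, a minimal sketch of the two-stage ingestion flow introduced above, assuming rag is an already-initialized LightRAG instance and texts is a list of document strings (names taken from the hunk; the wrapper function is illustrative only):

    import asyncio

    async def ingest(rag, texts):
        # Stage 1: enqueue the raw documents for processing
        await rag.apipeline_enqueue_documents(texts)
        # Stage 2: process everything enqueued (replaces the old separate
        # chunk-processing and graph-extraction calls)
        await rag.apipeline_process_enqueue_documents()

    # usage: asyncio.run(ingest(rag, texts))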

@@ -395,7 +395,9 @@ class LightRAG:
             split_by_character is None, this parameter is ignored.
         """
         await self.apipeline_process_documents(string_or_strings)
-        await self.apipeline_process_enqueue_documents(split_by_character, split_by_character_only)
+        await self.apipeline_process_enqueue_documents(
+            split_by_character, split_by_character_only
+        )

     def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
         loop = always_get_an_event_loop()
@@ -551,20 +553,23 @@ class LightRAG:
             return

         # Get allready processed documents (text chunks and full docs)
-        text_chunks_processed_doc_ids = await self.text_chunks.filter_keys(pending_doc_ids)
+        text_chunks_processed_doc_ids = await self.text_chunks.filter_keys(
+            pending_doc_ids
+        )
         full_docs_processed_doc_ids = await self.full_docs.filter_keys(pending_doc_ids)

         # 2. split docs into chunks, insert chunks, update doc status
         batch_size = self.addon_params.get("insert_batch_size", 10)
         batch_docs_list = [
-            pending_doc_ids[i : i + batch_size] for i in range(0, len(pending_doc_ids), batch_size)
+            pending_doc_ids[i : i + batch_size]
+            for i in range(0, len(pending_doc_ids), batch_size)
         ]

         # 3. iterate over batches
         tasks: dict[str, list[Coroutine[Any, Any, None]]] = {}
         for batch_idx, doc_ids in tqdm_async(
             enumerate(batch_docs_list),
-            desc=f"Process Batches",
+            desc="Process Batches",
         ):
             # 4. iterate over batch
             for doc_id in tqdm_async(
@@ -607,7 +612,9 @@ class LightRAG:
                 # Check if document already processed the doc
                 if doc_id not in full_docs_processed_doc_ids:
                     tasks[doc_id].append(
-                        self.full_docs.upsert({doc_id: {"content": status_doc["content"]}})
+                        self.full_docs.upsert(
+                            {doc_id: {"content": status_doc["content"]}}
+                        )
                     )

                 # Check if chunks already processed the doc
@@ -630,9 +637,7 @@ class LightRAG:
                     await self._insert_done()

                 except Exception as e:
-                    logger.error(
-                        f"Failed to process document {doc_id}: {str(e)}"
-                    )
+                    logger.error(f"Failed to process document {doc_id}: {str(e)}")
                     await self.doc_status.upsert(
                         {
                             doc_id: {