cleaned docs
@@ -121,9 +121,8 @@ async def main():
     texts = [x for x in all_text.split("\n") if x]
 
     # New mode use pipeline
-    await rag.apipeline_process_documents(texts)
-    await rag.apipeline_process_chunks()
-    await rag.apipeline_process_extract_graph()
+    await rag.apipeline_enqueue_documents(texts)
+    await rag.apipeline_process_enqueue_documents()
 
     # Old method use ainsert
     # await rag.ainsert(texts)
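The hunk above replaces the three-step example (separate calls for documents, chunks, and graph extraction) with a two-step pipeline: apipeline_enqueue_documents registers the texts, and apipeline_process_enqueue_documents runs the remaining processing. A minimal sketch of the new call order; `rag` is an already configured LightRAG instance, with constructor details (working_dir, LLM and embedding functions) omitted here:

    # Hedged sketch of the new two-step pipeline, assuming `rag` is a
    # fully configured LightRAG instance.
    async def index_texts(rag, all_text: str) -> None:
        texts = [x for x in all_text.split("\n") if x]
        # Step 1: register the documents with the pipeline.
        await rag.apipeline_enqueue_documents(texts)
        # Step 2: chunk, extract, and index everything that was enqueued.
        await rag.apipeline_process_enqueue_documents()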
@@ -395,7 +395,9 @@ class LightRAG:
             split_by_character is None, this parameter is ignored.
         """
         await self.apipeline_process_documents(string_or_strings)
-        await self.apipeline_process_enqueue_documents(split_by_character, split_by_character_only)
+        await self.apipeline_process_enqueue_documents(
+            split_by_character, split_by_character_only
+        )
 
     def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
         loop = always_get_an_event_loop()
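The reflowed call above shows that apipeline_process_enqueue_documents accepts the split_by_character and split_by_character_only parameters forwarded from the insert wrapper. A hedged sketch of a caller using them, with the semantics taken from the docstring context in this hunk (split on the given character first; split_by_character_only is ignored when split_by_character is None):

    # Assumption: insert() forwards these parameters unchanged, as the
    # hunk above shows; "\n\n" is only an illustrative separator, and
    # `rag` / `long_text` are assumed to be defined.
    rag.insert(
        long_text,
        split_by_character="\n\n",      # chunk on blank lines first
        split_by_character_only=False,  # then apply the normal size-based chunking
    )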
@@ -551,20 +553,23 @@ class LightRAG:
             return
 
         # Get allready processed documents (text chunks and full docs)
-        text_chunks_processed_doc_ids = await self.text_chunks.filter_keys(pending_doc_ids)
+        text_chunks_processed_doc_ids = await self.text_chunks.filter_keys(
+            pending_doc_ids
+        )
         full_docs_processed_doc_ids = await self.full_docs.filter_keys(pending_doc_ids)
 
         # 2. split docs into chunks, insert chunks, update doc status
         batch_size = self.addon_params.get("insert_batch_size", 10)
         batch_docs_list = [
-            pending_doc_ids[i : i + batch_size] for i in range(0, len(pending_doc_ids), batch_size)
+            pending_doc_ids[i : i + batch_size]
+            for i in range(0, len(pending_doc_ids), batch_size)
         ]
 
         # 3. iterate over batches
         tasks: dict[str, list[Coroutine[Any, Any, None]]] = {}
         for batch_idx, doc_ids in tqdm_async(
             enumerate(batch_docs_list),
-            desc=f"Process Batches",
+            desc="Process Batches",
         ):
             # 4. iterate over batch
             for doc_id in tqdm_async(
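Apart from the reflow, this hunk shows how pending document ids are sliced into groups of insert_batch_size (default 10, read from addon_params). A standalone sketch of that slicing arithmetic:

    # Standalone illustration of the batch slicing above.
    pending_doc_ids = [f"doc-{i}" for i in range(25)]
    batch_size = 10  # addon_params.get("insert_batch_size", 10)
    batch_docs_list = [
        pending_doc_ids[i : i + batch_size]
        for i in range(0, len(pending_doc_ids), batch_size)
    ]
    print([len(b) for b in batch_docs_list])  # [10, 10, 5]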
@@ -607,7 +612,9 @@ class LightRAG:
                     # Check if document already processed the doc
                     if doc_id not in full_docs_processed_doc_ids:
                         tasks[doc_id].append(
-                            self.full_docs.upsert({doc_id: {"content": status_doc["content"]}})
+                            self.full_docs.upsert(
+                                {doc_id: {"content": status_doc["content"]}}
+                            )
                         )
 
                     # Check if chunks already processed the doc
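This hunk only reflows the upsert call; the surrounding logic is a skip-if-already-stored check, where a full document is upserted only when its id is absent from full_docs_processed_doc_ids. A toy sketch of the pattern (the ids and the set below are hypothetical; in the real code they come from full_docs.filter_keys(pending_doc_ids)):

    # Hypothetical data illustrating the skip-if-already-stored check.
    full_docs_processed_doc_ids = {"doc-1"}
    for doc_id in ["doc-1", "doc-2"]:
        if doc_id not in full_docs_processed_doc_ids:
            print(f"would upsert full document {doc_id}")  # doc-2 only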
@@ -630,9 +637,7 @@ class LightRAG:
                     await self._insert_done()
 
                 except Exception as e:
-                    logger.error(
-                        f"Failed to process document {doc_id}: {str(e)}"
-                    )
+                    logger.error(f"Failed to process document {doc_id}: {str(e)}")
                     await self.doc_status.upsert(
                         {
                             doc_id: {
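The last hunk collapses the logger.error call onto one line; the pattern itself logs the failure and records it in the document-status store instead of aborting the whole batch. A hedged sketch of that pattern (the hunk is truncated before the status payload, so the fields written below are assumptions, not the commit's actual dict):

    import logging

    logger = logging.getLogger("lightrag")

    async def mark_failed(doc_status, doc_id: str, e: Exception) -> None:
        # Log the error, then persist the failure so a later run can retry.
        logger.error(f"Failed to process document {doc_id}: {str(e)}")
        await doc_status.upsert({doc_id: {"status": "FAILED"}})  # fields assumed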