Upgraded document loading engine
This commit is contained in:
@@ -237,7 +237,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
|
|||||||
)
|
)
|
||||||
return False
|
return False
|
||||||
case ".pdf":
|
case ".pdf":
|
||||||
if global_args["main_args"].document_loading_tool == "DOCLING":
|
if global_args["main_args"].document_loading_engine == "DOCLING":
|
||||||
if not pm.is_installed("docling"): # type: ignore
|
if not pm.is_installed("docling"): # type: ignore
|
||||||
pm.install("docling")
|
pm.install("docling")
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
@@ -256,7 +256,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
|
|||||||
for page in reader.pages:
|
for page in reader.pages:
|
||||||
content += page.extract_text() + "\n"
|
content += page.extract_text() + "\n"
|
||||||
case ".docx":
|
case ".docx":
|
||||||
if global_args["main_args"].document_loading_tool == "DOCLING":
|
if global_args["main_args"].document_loading_engine == "DOCLING":
|
||||||
if not pm.is_installed("docling"): # type: ignore
|
if not pm.is_installed("docling"): # type: ignore
|
||||||
pm.install("docling")
|
pm.install("docling")
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
@@ -276,7 +276,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
|
|||||||
[paragraph.text for paragraph in doc.paragraphs]
|
[paragraph.text for paragraph in doc.paragraphs]
|
||||||
)
|
)
|
||||||
case ".pptx":
|
case ".pptx":
|
||||||
if global_args["main_args"].document_loading_tool == "DOCLING":
|
if global_args["main_args"].document_loading_engine == "DOCLING":
|
||||||
if not pm.is_installed("docling"): # type: ignore
|
if not pm.is_installed("docling"): # type: ignore
|
||||||
pm.install("docling")
|
pm.install("docling")
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
@@ -297,7 +297,7 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
|
|||||||
if hasattr(shape, "text"):
|
if hasattr(shape, "text"):
|
||||||
content += shape.text + "\n"
|
content += shape.text + "\n"
|
||||||
case ".xlsx":
|
case ".xlsx":
|
||||||
if global_args["main_args"].document_loading_tool == "DOCLING":
|
if global_args["main_args"].document_loading_engine == "DOCLING":
|
||||||
if not pm.is_installed("docling"): # type: ignore
|
if not pm.is_installed("docling"): # type: ignore
|
||||||
pm.install("docling")
|
pm.install("docling")
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
@@ -344,7 +344,7 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace:
|
|||||||
args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int)
|
args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int)
|
||||||
|
|
||||||
# Select Document loading tool
|
# Select document loading engine
|
||||||
args.document_loading_tool = get_env_value("DOCUMENT_LOADING_TOOL", "DOCLING")
|
args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DOCLING")
|
||||||
|
|
||||||
ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name
|
ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user