Merge pull request #1011 from ParisNeo/main
Added docling option to load files
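In short: when the new DOCUMENT_LOADING_ENGINE option is set to DOCLING, the document upload route converts .pdf, .docx, .pptx, and .xlsx files with docling and keeps the exported Markdown as the document content; any other value (the default, DEFAULT) keeps the existing PyPDF2 / python-docx / python-pptx / openpyxl loaders. A minimal, hedged way to try it from Python, where only the variable name and its two values come from this diff and the rest is illustrative:

# Sketch: enable the docling loader before the LightRAG API server parses its options.
# DOCUMENT_LOADING_ENGINE and the "DOCLING"/"DEFAULT" values are from this PR; setting
# the variable via os.environ (rather than the server's .env file) is just one option.
import os

os.environ["DOCUMENT_LOADING_ENGINE"] = "DOCLING"
# Start the server as usual afterwards; parse_args() reads the value with
# get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT") (see the last hunk below).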
@@ -16,7 +16,11 @@ from pydantic import BaseModel, Field, field_validator
 from lightrag import LightRAG
 from lightrag.base import DocProcessingStatus, DocStatus
-from ..utils_api import get_api_key_dependency, get_auth_dependency
+from lightrag.api.utils_api import (
+    get_api_key_dependency,
+    global_args,
+    get_auth_dependency,
+)

 router = APIRouter(
     prefix="/documents",
@@ -240,54 +244,93 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
                     )
                     return False
             case ".pdf":
-                if not pm.is_installed("pypdf2"):  # type: ignore
-                    pm.install("pypdf2")
-                from PyPDF2 import PdfReader  # type: ignore
-                from io import BytesIO
-
-                pdf_file = BytesIO(file)
-                reader = PdfReader(pdf_file)
-                for page in reader.pages:
-                    content += page.extract_text() + "\n"
+                if global_args["main_args"].document_loading_engine == "DOCLING":
+                    if not pm.is_installed("docling"):  # type: ignore
+                        pm.install("docling")
+                    from docling.document_converter import DocumentConverter
+
+                    converter = DocumentConverter()
+                    result = converter.convert(file_path)
+                    content = result.document.export_to_markdown()
+                else:
+                    if not pm.is_installed("pypdf2"):  # type: ignore
+                        pm.install("pypdf2")
+                    from PyPDF2 import PdfReader  # type: ignore
+                    from io import BytesIO
+
+                    pdf_file = BytesIO(file)
+                    reader = PdfReader(pdf_file)
+                    for page in reader.pages:
+                        content += page.extract_text() + "\n"
             case ".docx":
-                if not pm.is_installed("python-docx"):  # type: ignore
-                    pm.install("docx")
-                from docx import Document  # type: ignore
-                from io import BytesIO
-
-                docx_file = BytesIO(file)
-                doc = Document(docx_file)
-                content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
+                if global_args["main_args"].document_loading_engine == "DOCLING":
+                    if not pm.is_installed("docling"):  # type: ignore
+                        pm.install("docling")
+                    from docling.document_converter import DocumentConverter
+
+                    converter = DocumentConverter()
+                    result = converter.convert(file_path)
+                    content = result.document.export_to_markdown()
+                else:
+                    if not pm.is_installed("python-docx"):  # type: ignore
+                        pm.install("docx")
+                    from docx import Document  # type: ignore
+                    from io import BytesIO
+
+                    docx_file = BytesIO(file)
+                    doc = Document(docx_file)
+                    content = "\n".join(
+                        [paragraph.text for paragraph in doc.paragraphs]
+                    )
             case ".pptx":
-                if not pm.is_installed("python-pptx"):  # type: ignore
-                    pm.install("pptx")
-                from pptx import Presentation  # type: ignore
-                from io import BytesIO
-
-                pptx_file = BytesIO(file)
-                prs = Presentation(pptx_file)
-                for slide in prs.slides:
-                    for shape in slide.shapes:
-                        if hasattr(shape, "text"):
-                            content += shape.text + "\n"
+                if global_args["main_args"].document_loading_engine == "DOCLING":
+                    if not pm.is_installed("docling"):  # type: ignore
+                        pm.install("docling")
+                    from docling.document_converter import DocumentConverter
+
+                    converter = DocumentConverter()
+                    result = converter.convert(file_path)
+                    content = result.document.export_to_markdown()
+                else:
+                    if not pm.is_installed("python-pptx"):  # type: ignore
+                        pm.install("pptx")
+                    from pptx import Presentation  # type: ignore
+                    from io import BytesIO
+
+                    pptx_file = BytesIO(file)
+                    prs = Presentation(pptx_file)
+                    for slide in prs.slides:
+                        for shape in slide.shapes:
+                            if hasattr(shape, "text"):
+                                content += shape.text + "\n"
             case ".xlsx":
-                if not pm.is_installed("openpyxl"):  # type: ignore
-                    pm.install("openpyxl")
-                from openpyxl import load_workbook  # type: ignore
-                from io import BytesIO
-
-                xlsx_file = BytesIO(file)
-                wb = load_workbook(xlsx_file)
-                for sheet in wb:
-                    content += f"Sheet: {sheet.title}\n"
-                    for row in sheet.iter_rows(values_only=True):
-                        content += (
-                            "\t".join(
-                                str(cell) if cell is not None else "" for cell in row
-                            )
-                            + "\n"
-                        )
-                    content += "\n"
+                if global_args["main_args"].document_loading_engine == "DOCLING":
+                    if not pm.is_installed("docling"):  # type: ignore
+                        pm.install("docling")
+                    from docling.document_converter import DocumentConverter
+
+                    converter = DocumentConverter()
+                    result = converter.convert(file_path)
+                    content = result.document.export_to_markdown()
+                else:
+                    if not pm.is_installed("openpyxl"):  # type: ignore
+                        pm.install("openpyxl")
+                    from openpyxl import load_workbook  # type: ignore
+                    from io import BytesIO
+
+                    xlsx_file = BytesIO(file)
+                    wb = load_workbook(xlsx_file)
+                    for sheet in wb:
+                        content += f"Sheet: {sheet.title}\n"
+                        for row in sheet.iter_rows(values_only=True):
+                            content += (
+                                "\t".join(
+                                    str(cell) if cell is not None else ""
+                                    for cell in row
+                                )
+                                + "\n"
+                            )
+                        content += "\n"
             case _:
                 logger.error(
                     f"Unsupported file type: {file_path.name} (extension {ext})"
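For readers who have not used docling: the DOCLING branches above all reduce to the same three calls. The DocumentConverter, convert, and export_to_markdown names are taken directly from the hunk; the example path and the print are illustrative only.

# Standalone sketch of the docling path used above (assumes docling is installed).
from pathlib import Path

from docling.document_converter import DocumentConverter

file_path = Path("example.pdf")  # illustrative; the route passes the uploaded file's path

converter = DocumentConverter()
result = converter.convert(file_path)            # let docling parse the document
content = result.document.export_to_markdown()   # one Markdown string, kept as `content` above
print(content[:500])

Note that the docling branch converts from file_path on disk, while the legacy branches parse the already-read bytes via BytesIO(file); both end up assigning the extracted text to content.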
@@ -11,7 +11,7 @@ import asyncio
 from ascii_colors import trace_exception
 from lightrag import LightRAG, QueryParam
 from lightrag.utils import encode_string_by_tiktoken
-from ..utils_api import ollama_server_infos
+from lightrag.api.utils_api import ollama_server_infos


 # query mode according to query prefix (bypass is not LightRAG quer mode)
@@ -18,6 +18,8 @@ from .auth import auth_handler
 # Load environment variables
 load_dotenv(override=True)

+global_args = {"main_args": None}
+

 class OllamaServerInfos:
     # Constants for emulated Ollama model information
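The module-level global_args dict added here is how the parsed CLI/env options reach the routers: parse_args() (next hunk) stores its namespace under "main_args", and document_routes.py reads it back when deciding which loader to use. A reduced, runnable sketch of the pattern, with the names taken from this diff and argparse.Namespace standing in for the real parser:

# Reduced sketch of the sharing pattern introduced by this PR.
import argparse

global_args = {"main_args": None}  # module-level registry, as added in this hunk


def parse_args() -> argparse.Namespace:
    # Stand-in for the real parser; only the publishing step mirrors the diff.
    args = argparse.Namespace(document_loading_engine="DOCLING")
    global_args["main_args"] = args
    return args


parse_args()
# Router side (document_routes.py in this PR) reads the option back at request time:
use_docling = global_args["main_args"].document_loading_engine == "DOCLING"
print(use_docling)  # True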
@@ -360,8 +362,12 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace:
     args.chunk_size = get_env_value("CHUNK_SIZE", 1200, int)
     args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int)

+    # Select Document loading tool (DOCLING, DEFAULT)
+    args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
+
     ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name

+    global_args["main_args"] = args
     return args
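Because the value is read through get_env_value with a "DEFAULT" fallback, deployments that never set the variable keep the old loaders unchanged. A hedged sketch of what the new parse_args lines amount to (get_env_value itself lives in utils_api.py and is not shown in this diff; the inline fallback below only mirrors the visible call):

import os

# "DOCLING" switches the four PDF/Office cases to docling; anything else keeps the old loaders.
document_loading_engine = os.environ.get("DOCUMENT_LOADING_ENGINE", "DEFAULT")
print(document_loading_engine)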