From b144e0c3b0298243c6673e524df414057a6fb9d5 Mon Sep 17 00:00:00 2001 From: yangdx Date: Fri, 21 Feb 2025 21:07:37 +0800 Subject: [PATCH] Sync modifications from main branch --- lightrag/api/routers/document_routes.py | 67 ++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index c17ccd88..25ca24e4 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -117,6 +117,37 @@ class DocumentManager: ".docx", ".pptx", ".xlsx", + ".rtf", # Rich Text Format + ".odt", # OpenDocument Text + ".tex", # LaTeX + ".epub", # Electronic Publication + ".html", # HyperText Markup Language + ".htm", # HyperText Markup Language + ".csv", # Comma-Separated Values + ".json", # JavaScript Object Notation + ".xml", # eXtensible Markup Language + ".yaml", # YAML Ain't Markup Language + ".yml", # YAML + ".log", # Log files + ".conf", # Configuration files + ".ini", # Initialization files + ".properties", # Java properties files + ".sql", # SQL scripts + ".bat", # Batch files + ".sh", # Shell scripts + ".c", # C source code + ".cpp", # C++ source code + ".py", # Python source code + ".java", # Java source code + ".js", # JavaScript source code + ".ts", # TypeScript source code + ".swift", # Swift source code + ".go", # Go source code + ".rb", # Ruby source code + ".php", # PHP source code + ".css", # Cascading Style Sheets + ".scss", # Sassy CSS + ".less", # LESS CSS ), ): self.input_dir = Path(input_dir) @@ -170,7 +201,41 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool: # Process based on file type match ext: - case ".txt" | ".md": + case ( + ".txt" + | ".md" + | ".html" + | ".htm" + | ".tex" + | ".json" + | ".xml" + | ".yaml" + | ".yml" + | ".rtf" + | ".odt" + | ".epub" + | ".csv" + | ".log" + | ".conf" + | ".ini" + | ".properties" + | ".sql" + | ".bat" + | ".sh" + | ".c" + | ".cpp" + | ".py" + | ".java" + | ".js" + | ".ts" + | ".swift" + | ".go" + | ".rb" + | ".php" + | ".css" + | ".scss" + | ".less" + ): content = file.decode("utf-8") case ".pdf": if not pm.is_installed("pypdf2"):