Compare commits

1 commit
build ... main

8916f8a912  feat: add delete method for mongo storage implement  (2025-05-22 04:41:52 +08:00)
All checks were successful: Linting and Formatting / lint-and-format (push) succeeded in 3m47s
15 changed files with 303 additions and 239 deletions

(deleted file, 29 lines; file name not shown)

@@ -1,29 +0,0 @@
name: Build and Push Docker Image
on:
push:
branches:
- main
- build
jobs:
build-and-push:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Registry
uses: docker/login-action@v3
with:
registry: docker.sunxinao.cn
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: Build and Push Docker Image
uses: docker/build-push-action@v5
with:
context: .
file: ./Dockerfile
push: true
tags: docker.sunxinao.cn/gardel/lightrag:latest

.github/ISSUE_TEMPLATE/bug_report.yml (new file, 61 lines)

@@ -0,0 +1,61 @@
name: Bug Report
description: File a bug report
title: "[Bug]:"
labels: ["bug", "triage"]
body:
- type: checkboxes
id: existingcheck
attributes:
label: Do you need to file an issue?
description: Please help us manage our time by avoiding duplicates and common bugs with the steps below.
options:
- label: I have searched the existing issues and this bug is not already filed.
- label: I believe this is a legitimate bug, not just a question or feature request.
- type: textarea
id: description
attributes:
label: Describe the bug
description: A clear and concise description of what the bug is.
placeholder: What went wrong?
- type: textarea
id: reproduce
attributes:
label: Steps to reproduce
description: Steps to reproduce the behavior.
placeholder: How can we replicate the issue?
- type: textarea
id: expected_behavior
attributes:
label: Expected Behavior
description: A clear and concise description of what you expected to happen.
placeholder: What should have happened?
- type: textarea
id: configused
attributes:
label: LightRAG Config Used
description: The LightRAG configuration used for the run.
placeholder: The settings content or LightRAG configuration
value: |
# Paste your config here
- type: textarea
id: screenshotslogs
attributes:
label: Logs and screenshots
description: If applicable, add screenshots and logs to help explain your problem.
placeholder: Add logs and screenshots here
- type: textarea
id: additional_information
attributes:
label: Additional Information
description: |
- LightRAG Version: e.g., v0.1.1
- Operating System: e.g., Windows 10, Ubuntu 20.04
- Python Version: e.g., 3.8
- Related Issues: e.g., #1
- Any other relevant information.
value: |
- LightRAG Version:
- Operating System:
- Python Version:
- Related Issues:

.github/ISSUE_TEMPLATE/config.yml (new file, 1 line)

@@ -0,0 +1 @@
blank_issues_enabled: false

.github/ISSUE_TEMPLATE/feature_request.yml (new file, 26 lines)

@@ -0,0 +1,26 @@
name: Feature Request
description: File a feature request
labels: ["enhancement"]
title: "[Feature Request]:"
body:
- type: checkboxes
id: existingcheck
attributes:
label: Do you need to file a feature request?
description: Please help us manage our time by avoiding duplicates and common feature requests with the steps below.
options:
- label: I have searched the existing feature requests and this feature request is not already filed.
- label: I believe this is a legitimate feature request, not just a question or bug.
- type: textarea
id: feature_request_description
attributes:
label: Feature Request Description
description: A clear and concise description of the feature request you would like.
placeholder: What would this feature request add or improve?
- type: textarea
id: additional_context
attributes:
label: Additional Context
description: Add any other context or screenshots about the feature request here.
placeholder: Any additional information

.github/ISSUE_TEMPLATE/question.yml (new file, 26 lines)

@@ -0,0 +1,26 @@
name: Question
description: Ask a general question
labels: ["question"]
title: "[Question]:"
body:
- type: checkboxes
id: existingcheck
attributes:
label: Do you need to ask a question?
description: Please help us manage our time by avoiding duplicates and common questions with the steps below.
options:
- label: I have searched the existing questions and discussions and this question is not already answered.
- label: I believe this is a legitimate question, not just a bug or feature request.
- type: textarea
id: question
attributes:
label: Your Question
description: A clear and concise description of your question.
placeholder: What is your question?
- type: textarea
id: context
attributes:
label: Additional Context
description: Provide any additional context or details that might help us understand your question better.
placeholder: Add any relevant information here

.github/dependabot.yml (new file, 11 lines)

@@ -0,0 +1,11 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests
schedule:
interval: "weekly"

.github/pull_request_template.md (new file, 32 lines)

@@ -0,0 +1,32 @@
<!--
Thanks for contributing to LightRAG!
Please ensure your pull request is ready for review before submitting.
About this template
This template helps contributors provide a clear and concise description of their changes. Feel free to adjust it as needed.
-->
## Description
[Briefly describe the changes made in this pull request.]
## Related Issues
[Reference any related issues or tasks addressed by this pull request.]
## Changes Made
[List the specific changes made in this pull request.]
## Checklist
- [ ] Changes tested locally
- [ ] Code reviewed
- [ ] Documentation updated (if necessary)
- [ ] Unit tests added (if applicable)
## Additional Notes
[Add any additional notes or context for the reviewer(s).]

.github/workflows/docker-publish.yml (new file, 47 lines)

@@ -0,0 +1,47 @@
name: Build and Push Docker Image
on:
release:
types: [published]
workflow_dispatch:
permissions:
contents: read
packages: write
jobs:
build-and-push:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: ghcr.io/${{ github.repository }}
tags: |
type=semver,pattern={{version}}
type=raw,value=latest,enable={{is_default_branch}}
- name: Build and push Docker image
uses: docker/build-push-action@v5
with:
context: .
platforms: linux/amd64,linux/arm64
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max

.github/workflows/linting.yaml (new file, 30 lines)

@@ -0,0 +1,30 @@
name: Linting and Formatting
on:
push:
branches:
- main
pull_request:
branches:
- main
jobs:
lint-and-format:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pre-commit
- name: Run pre-commit
run: pre-commit run --all-files --show-diff-on-failure

.github/workflows/pypi-publish.yml (new file, 52 lines)

@@ -0,0 +1,52 @@
name: Upload LightRAG-hku Package
on:
release:
types: [published]
permissions:
contents: read
jobs:
release-build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.x"
- name: Build release distributions
run: |
python -m pip install build
python -m build
- name: Upload distributions
uses: actions/upload-artifact@v4
with:
name: release-dists
path: dist/
pypi-publish:
runs-on: ubuntu-latest
needs:
- release-build
permissions:
id-token: write
environment:
name: pypi
steps:
- name: Retrieve release distributions
uses: actions/download-artifact@v4
with:
name: release-dists
path: dist/
- name: Publish release distributions to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
packages-dir: dist/

(file name not shown)

@@ -53,6 +53,7 @@ async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwar
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
**kwargs,
)
return response
except Exception as e:

(deleted file, 155 lines; file name not shown)

@@ -1,155 +0,0 @@
import os
from lightrag import LightRAG, QueryParam
from lightrag.llm.llama_index_impl import (
llama_index_complete_if_cache,
llama_index_embed,
)
from lightrag.utils import EmbeddingFunc
from llama_index.llms.litellm import LiteLLM
from llama_index.embeddings.litellm import LiteLLMEmbedding
import asyncio
import nest_asyncio
nest_asyncio.apply()
from lightrag.kg.shared_storage import initialize_pipeline_status
# Configure working directory
WORKING_DIR = "./index_default"
print(f"WORKING_DIR: {WORKING_DIR}")
# Model configuration
LLM_MODEL = os.environ.get("LLM_MODEL", "gemma-3-4b")
print(f"LLM_MODEL: {LLM_MODEL}")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "arctic-embed")
print(f"EMBEDDING_MODEL: {EMBEDDING_MODEL}")
EMBEDDING_MAX_TOKEN_SIZE = int(os.environ.get("EMBEDDING_MAX_TOKEN_SIZE", 8192))
print(f"EMBEDDING_MAX_TOKEN_SIZE: {EMBEDDING_MAX_TOKEN_SIZE}")
# LiteLLM configuration
LITELLM_URL = os.environ.get("LITELLM_URL", "http://localhost:4000")
print(f"LITELLM_URL: {LITELLM_URL}")
LITELLM_KEY = os.environ.get("LITELLM_KEY", "sk-4JdvGFKqSA3S0k_5p0xufw")
if not os.path.exists(WORKING_DIR):
os.mkdir(WORKING_DIR)
# Initialize LLM function
async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
try:
# Initialize LiteLLM if not in kwargs
if "llm_instance" not in kwargs:
llm_instance = LiteLLM(
model=f"openai/{LLM_MODEL}", # Format: "provider/model_name"
api_base=LITELLM_URL,
api_key=LITELLM_KEY,
temperature=0.7,
)
kwargs["llm_instance"] = llm_instance
chat_kwargs = {}
chat_kwargs["litellm_params"] = {
"metadata": {
"opik": {
"project_name": "lightrag_llamaindex_litellm_opik_demo",
"tags": ["lightrag", "litellm"],
}
}
}
response = await llama_index_complete_if_cache(
kwargs["llm_instance"],
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
chat_kwargs=chat_kwargs,
)
return response
except Exception as e:
print(f"LLM request failed: {str(e)}")
raise
# Initialize embedding function
async def embedding_func(texts):
try:
embed_model = LiteLLMEmbedding(
model_name=f"openai/{EMBEDDING_MODEL}",
api_base=LITELLM_URL,
api_key=LITELLM_KEY,
)
return await llama_index_embed(texts, embed_model=embed_model)
except Exception as e:
print(f"Embedding failed: {str(e)}")
raise
# Get embedding dimension
async def get_embedding_dim():
test_text = ["This is a test sentence."]
embedding = await embedding_func(test_text)
embedding_dim = embedding.shape[1]
print(f"embedding_dim={embedding_dim}")
return embedding_dim
async def initialize_rag():
embedding_dimension = await get_embedding_dim()
rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=llm_model_func,
embedding_func=EmbeddingFunc(
embedding_dim=embedding_dimension,
max_token_size=EMBEDDING_MAX_TOKEN_SIZE,
func=embedding_func,
),
)
await rag.initialize_storages()
await initialize_pipeline_status()
return rag
def main():
# Initialize RAG instance
rag = asyncio.run(initialize_rag())
# Insert example text
with open("./book.txt", "r", encoding="utf-8") as f:
rag.insert(f.read())
# Test different query modes
print("\nNaive Search:")
print(
rag.query(
"What are the top themes in this story?", param=QueryParam(mode="naive")
)
)
print("\nLocal Search:")
print(
rag.query(
"What are the top themes in this story?", param=QueryParam(mode="local")
)
)
print("\nGlobal Search:")
print(
rag.query(
"What are the top themes in this story?", param=QueryParam(mode="global")
)
)
print("\nHybrid Search:")
print(
rag.query(
"What are the top themes in this story?", param=QueryParam(mode="hybrid")
)
)
if __name__ == "__main__":
main()

(file name not shown)

@@ -84,30 +84,22 @@ class InsertTextRequest(BaseModel):
Attributes:
text: The text content to be inserted into the RAG system
file_source: Source of the text (optional)
"""
text: str = Field(
min_length=1,
description="The text to insert",
)
file_source: str = Field(default=None, min_length=0, description="File Source")
@field_validator("text", mode="after")
@classmethod
def strip_text_after(cls, text: str) -> str:
def strip_after(cls, text: str) -> str:
return text.strip()
@field_validator("file_source", mode="after")
@classmethod
def strip_source_after(cls, file_source: str) -> str:
return file_source.strip()
class Config:
json_schema_extra = {
"example": {
"text": "This is a sample text to be inserted into the RAG system.",
"file_source": "Source of the text (optional)",
"text": "This is a sample text to be inserted into the RAG system."
}
}
@@ -117,37 +109,25 @@ class InsertTextsRequest(BaseModel):
Attributes:
texts: List of text contents to be inserted into the RAG system
file_sources: Sources of the texts (optional)
"""
texts: list[str] = Field(
min_length=1,
description="The texts to insert",
)
file_sources: list[str] = Field(
default=None, min_length=0, description="Sources of the texts"
)
@field_validator("texts", mode="after")
@classmethod
def strip_texts_after(cls, texts: list[str]) -> list[str]:
def strip_after(cls, texts: list[str]) -> list[str]:
return [text.strip() for text in texts]
@field_validator("file_sources", mode="after")
@classmethod
def strip_sources_after(cls, file_sources: list[str]) -> list[str]:
return [file_source.strip() for file_source in file_sources]
class Config:
json_schema_extra = {
"example": {
"texts": [
"This is the first text to be inserted.",
"This is the second text to be inserted.",
],
"file_sources": [
"First file source (optional)",
],
]
}
}
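
For illustration, a minimal, self-contained sketch of the simplified single-text model after this change; the class is re-declared locally because its import path is not shown in this diff, and the field and validator are copied from the hunk above:

# Sketch only: the import path of InsertTextRequest is not shown in this diff,
# so the model is re-declared here from the fields visible above.
from pydantic import BaseModel, Field, field_validator

class InsertTextRequest(BaseModel):
    text: str = Field(min_length=1, description="The text to insert")

    @field_validator("text", mode="after")
    @classmethod
    def strip_after(cls, text: str) -> str:
        return text.strip()

req = InsertTextRequest(text="  This is a sample text to be inserted into the RAG system.  ")
print(req.text)  # surrounding whitespace is stripped by the validator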
@@ -676,25 +656,16 @@ async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]):
logger.error(traceback.format_exc())
async def pipeline_index_texts(
rag: LightRAG, texts: List[str], file_sources: List[str] = None
):
async def pipeline_index_texts(rag: LightRAG, texts: List[str]):
"""Index a list of texts
Args:
rag: LightRAG instance
texts: The texts to index
file_sources: Sources of the texts
"""
if not texts:
return
if file_sources is not None:
if len(file_sources) != 0 and len(file_sources) != len(texts):
[
file_sources.append("unknown_source")
for _ in range(len(file_sources), len(texts))
]
await rag.apipeline_enqueue_documents(input=texts, file_paths=file_sources)
await rag.apipeline_enqueue_documents(texts)
await rag.apipeline_process_enqueue_documents()
@@ -845,12 +816,7 @@ def create_document_routes(
HTTPException: If an error occurs during text processing (500).
"""
try:
background_tasks.add_task(
pipeline_index_texts,
rag,
[request.text],
file_sources=[request.file_source],
)
background_tasks.add_task(pipeline_index_texts, rag, [request.text])
return InsertResponse(
status="success",
message="Text successfully received. Processing will continue in background.",
@@ -885,12 +851,7 @@ def create_document_routes(
HTTPException: If an error occurs during text processing (500).
"""
try:
background_tasks.add_task(
pipeline_index_texts,
rag,
request.texts,
file_sources=request.file_sources,
)
background_tasks.add_task(pipeline_index_texts, rag, request.texts)
return InsertResponse(
status="success",
message="Text successfully received. Processing will continue in background.",

(file name not shown)

@@ -78,10 +78,6 @@ class QueryRequest(BaseModel):
description="Number of complete conversation turns (user-assistant pairs) to consider in the response context.",
)
ids: list[str] | None = Field(
default=None, description="List of ids to filter the results."
)
user_prompt: Optional[str] = Field(
default=None,
description="User-provided prompt for the query. If provided, this will be used instead of the default value from prompt template.",

lightrag/llm/llama_index_impl.py

@@ -95,7 +95,7 @@ async def llama_index_complete_if_cache(
prompt: str,
system_prompt: Optional[str] = None,
history_messages: List[dict] = [],
chat_kwargs={},
**kwargs,
) -> str:
"""Complete the prompt using LlamaIndex."""
try:
@@ -122,9 +122,13 @@ async def llama_index_complete_if_cache(
# Add current prompt
formatted_messages.append(ChatMessage(role=MessageRole.USER, content=prompt))
response: ChatResponse = await model.achat(
messages=formatted_messages, **chat_kwargs
)
# Get LLM instance from kwargs
if "llm_instance" not in kwargs:
raise ValueError("llm_instance must be provided in kwargs")
llm = kwargs["llm_instance"]
# Get response
response: ChatResponse = await llm.achat(messages=formatted_messages)
# In newer versions, the response is in message.content
content = response.message.content
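
The body above requires callers to supply the LLM through kwargs as llm_instance (a ValueError is raised otherwise). Below is a minimal wrapper sketch in the style of the earlier example hunk; the model name, proxy URL, and key are placeholders taken from the removed example, and since the helper's full positional signature is not visible in this diff, the forwarding call is an assumption:

# Sketch only: ensures llm_instance is present in kwargs before forwarding, since the
# helper above raises ValueError when it is missing. Positional-argument order is an
# assumption based on the example hunk earlier in this diff.
from llama_index.llms.litellm import LiteLLM
from lightrag.llm.llama_index_impl import llama_index_complete_if_cache

async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
    if "llm_instance" not in kwargs:
        kwargs["llm_instance"] = LiteLLM(
            model="openai/gemma-3-4b",         # placeholder values from the removed example
            api_base="http://localhost:4000",
            api_key="sk-...",
        )
    return await llama_index_complete_if_cache(
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        **kwargs,
    )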