Compare commits

1 Commit: build ... main

SHA1        Message                                               Date
8916f8a912  feat: add delete method for mongo storage implement  2025-05-22 04:41:52 +08:00

All checks were successful: Linting and Formatting / lint-and-format (push), successful in 3m47s
15 changed files with 303 additions and 239 deletions


@@ -1,29 +0,0 @@
name: Build and Push Docker Image
on:
  push:
    branches:
      - main
      - build
jobs:
  build-and-push:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to Docker Registry
        uses: docker/login-action@v3
        with:
          registry: docker.sunxinao.cn
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
      - name: Build and Push Docker Image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./Dockerfile
          push: true
          tags: docker.sunxinao.cn/gardel/lightrag:latest

.github/ISSUE_TEMPLATE/bug_report.yml vendored Normal file

@@ -0,0 +1,61 @@
name: Bug Report
description: File a bug report
title: "[Bug]:"
labels: ["bug", "triage"]
body:
  - type: checkboxes
    id: existingcheck
    attributes:
      label: Do you need to file an issue?
      description: Please help us manage our time by avoiding duplicates and common bugs with the steps below.
      options:
        - label: I have searched the existing issues and this bug is not already filed.
        - label: I believe this is a legitimate bug, not just a question or feature request.
  - type: textarea
    id: description
    attributes:
      label: Describe the bug
      description: A clear and concise description of what the bug is.
      placeholder: What went wrong?
  - type: textarea
    id: reproduce
    attributes:
      label: Steps to reproduce
      description: Steps to reproduce the behavior.
      placeholder: How can we replicate the issue?
  - type: textarea
    id: expected_behavior
    attributes:
      label: Expected Behavior
      description: A clear and concise description of what you expected to happen.
      placeholder: What should have happened?
  - type: textarea
    id: configused
    attributes:
      label: LightRAG Config Used
      description: The LightRAG configuration used for the run.
      placeholder: The settings content or LightRAG configuration
      value: |
        # Paste your config here
  - type: textarea
    id: screenshotslogs
    attributes:
      label: Logs and screenshots
      description: If applicable, add screenshots and logs to help explain your problem.
      placeholder: Add logs and screenshots here
  - type: textarea
    id: additional_information
    attributes:
      label: Additional Information
      description: |
        - LightRAG Version: e.g., v0.1.1
        - Operating System: e.g., Windows 10, Ubuntu 20.04
        - Python Version: e.g., 3.8
        - Related Issues: e.g., #1
        - Any other relevant information.
      value: |
        - LightRAG Version:
        - Operating System:
        - Python Version:
        - Related Issues:

.github/ISSUE_TEMPLATE/config.yml vendored Normal file

@@ -0,0 +1 @@
blank_issues_enabled: false


@@ -0,0 +1,26 @@
name: Feature Request
description: File a feature request
labels: ["enhancement"]
title: "[Feature Request]:"
body:
  - type: checkboxes
    id: existingcheck
    attributes:
      label: Do you need to file a feature request?
      description: Please help us manage our time by avoiding duplicates and common feature requests with the steps below.
      options:
        - label: I have searched the existing feature requests and this feature request is not already filed.
        - label: I believe this is a legitimate feature request, not just a question or bug.
  - type: textarea
    id: feature_request_description
    attributes:
      label: Feature Request Description
      description: A clear and concise description of the feature request you would like.
      placeholder: What would this feature request add or improve?
  - type: textarea
    id: additional_context
    attributes:
      label: Additional Context
      description: Add any other context or screenshots about the feature request here.
      placeholder: Any additional information

.github/ISSUE_TEMPLATE/question.yml vendored Normal file

@@ -0,0 +1,26 @@
name: Question
description: Ask a general question
labels: ["question"]
title: "[Question]:"
body:
  - type: checkboxes
    id: existingcheck
    attributes:
      label: Do you need to ask a question?
      description: Please help us manage our time by avoiding duplicates and common questions with the steps below.
      options:
        - label: I have searched the existing questions and discussions and this question is not already answered.
        - label: I believe this is a legitimate question, not just a bug or feature request.
  - type: textarea
    id: question
    attributes:
      label: Your Question
      description: A clear and concise description of your question.
      placeholder: What is your question?
  - type: textarea
    id: context
    attributes:
      label: Additional Context
      description: Provide any additional context or details that might help us understand your question better.
      placeholder: Add any relevant information here

.github/dependabot.yml vendored Normal file

@@ -0,0 +1,11 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file

version: 2
updates:
  - package-ecosystem: "pip" # See documentation for possible values
    directory: "/" # Location of package manifests
    schedule:
      interval: "weekly"

.github/pull_request_template.md vendored Normal file

@@ -0,0 +1,32 @@
<!--
Thanks for contributing to LightRAG!

Please ensure your pull request is ready for review before submitting.

About this template

This template helps contributors provide a clear and concise description of their changes. Feel free to adjust it as needed.
-->

## Description

[Briefly describe the changes made in this pull request.]

## Related Issues

[Reference any related issues or tasks addressed by this pull request.]

## Changes Made

[List the specific changes made in this pull request.]

## Checklist

- [ ] Changes tested locally
- [ ] Code reviewed
- [ ] Documentation updated (if necessary)
- [ ] Unit tests added (if applicable)

## Additional Notes

[Add any additional notes or context for the reviewer(s).]

.github/workflows/docker-publish.yml vendored Normal file

@@ -0,0 +1,47 @@
name: Build and Push Docker Image
on:
  release:
    types: [published]
  workflow_dispatch:
permissions:
  contents: read
  packages: write
jobs:
  build-and-push:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Extract metadata for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ghcr.io/${{ github.repository }}
          tags: |
            type=semver,pattern={{version}}
            type=raw,value=latest,enable={{is_default_branch}}
      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: linux/amd64,linux/arm64
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

.github/workflows/linting.yaml vendored Normal file

@@ -0,0 +1,30 @@
name: Linting and Formatting
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
jobs:
  lint-and-format:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pre-commit
      - name: Run pre-commit
        run: pre-commit run --all-files --show-diff-on-failure

.github/workflows/pypi-publish.yml vendored Normal file

@@ -0,0 +1,52 @@
name: Upload LightRAG-hku Package
on:
  release:
    types: [published]
permissions:
  contents: read
jobs:
  release-build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.x"
      - name: Build release distributions
        run: |
          python -m pip install build
          python -m build
      - name: Upload distributions
        uses: actions/upload-artifact@v4
        with:
          name: release-dists
          path: dist/
  pypi-publish:
    runs-on: ubuntu-latest
    needs:
      - release-build
    permissions:
      id-token: write
    environment:
      name: pypi
    steps:
      - name: Retrieve release distributions
        uses: actions/download-artifact@v4
        with:
          name: release-dists
          path: dist/
      - name: Publish release distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          packages-dir: dist/


@@ -53,6 +53,7 @@ async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
             prompt,
             system_prompt=system_prompt,
             history_messages=history_messages,
+            **kwargs,
         )
         return response
     except Exception as e:


@@ -1,155 +0,0 @@
import os
from lightrag import LightRAG, QueryParam
from lightrag.llm.llama_index_impl import (
    llama_index_complete_if_cache,
    llama_index_embed,
)
from lightrag.utils import EmbeddingFunc
from llama_index.llms.litellm import LiteLLM
from llama_index.embeddings.litellm import LiteLLMEmbedding
import asyncio
import nest_asyncio

nest_asyncio.apply()

from lightrag.kg.shared_storage import initialize_pipeline_status

# Configure working directory
WORKING_DIR = "./index_default"
print(f"WORKING_DIR: {WORKING_DIR}")

# Model configuration
LLM_MODEL = os.environ.get("LLM_MODEL", "gemma-3-4b")
print(f"LLM_MODEL: {LLM_MODEL}")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "arctic-embed")
print(f"EMBEDDING_MODEL: {EMBEDDING_MODEL}")
EMBEDDING_MAX_TOKEN_SIZE = int(os.environ.get("EMBEDDING_MAX_TOKEN_SIZE", 8192))
print(f"EMBEDDING_MAX_TOKEN_SIZE: {EMBEDDING_MAX_TOKEN_SIZE}")

# LiteLLM configuration
LITELLM_URL = os.environ.get("LITELLM_URL", "http://localhost:4000")
print(f"LITELLM_URL: {LITELLM_URL}")
LITELLM_KEY = os.environ.get("LITELLM_KEY", "sk-4JdvGFKqSA3S0k_5p0xufw")

if not os.path.exists(WORKING_DIR):
    os.mkdir(WORKING_DIR)


# Initialize LLM function
async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
    try:
        # Initialize LiteLLM if not in kwargs
        if "llm_instance" not in kwargs:
            llm_instance = LiteLLM(
                model=f"openai/{LLM_MODEL}",  # Format: "provider/model_name"
                api_base=LITELLM_URL,
                api_key=LITELLM_KEY,
                temperature=0.7,
            )
            kwargs["llm_instance"] = llm_instance

        chat_kwargs = {}
        chat_kwargs["litellm_params"] = {
            "metadata": {
                "opik": {
                    "project_name": "lightrag_llamaindex_litellm_opik_demo",
                    "tags": ["lightrag", "litellm"],
                }
            }
        }

        response = await llama_index_complete_if_cache(
            kwargs["llm_instance"],
            prompt,
            system_prompt=system_prompt,
            history_messages=history_messages,
            chat_kwargs=chat_kwargs,
        )
        return response
    except Exception as e:
        print(f"LLM request failed: {str(e)}")
        raise


# Initialize embedding function
async def embedding_func(texts):
    try:
        embed_model = LiteLLMEmbedding(
            model_name=f"openai/{EMBEDDING_MODEL}",
            api_base=LITELLM_URL,
            api_key=LITELLM_KEY,
        )
        return await llama_index_embed(texts, embed_model=embed_model)
    except Exception as e:
        print(f"Embedding failed: {str(e)}")
        raise


# Get embedding dimension
async def get_embedding_dim():
    test_text = ["This is a test sentence."]
    embedding = await embedding_func(test_text)
    embedding_dim = embedding.shape[1]
    print(f"embedding_dim={embedding_dim}")
    return embedding_dim


async def initialize_rag():
    embedding_dimension = await get_embedding_dim()

    rag = LightRAG(
        working_dir=WORKING_DIR,
        llm_model_func=llm_model_func,
        embedding_func=EmbeddingFunc(
            embedding_dim=embedding_dimension,
            max_token_size=EMBEDDING_MAX_TOKEN_SIZE,
            func=embedding_func,
        ),
    )

    await rag.initialize_storages()
    await initialize_pipeline_status()

    return rag


def main():
    # Initialize RAG instance
    rag = asyncio.run(initialize_rag())

    # Insert example text
    with open("./book.txt", "r", encoding="utf-8") as f:
        rag.insert(f.read())

    # Test different query modes
    print("\nNaive Search:")
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="naive")
        )
    )

    print("\nLocal Search:")
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="local")
        )
    )

    print("\nGlobal Search:")
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="global")
        )
    )

    print("\nHybrid Search:")
    print(
        rag.query(
            "What are the top themes in this story?", param=QueryParam(mode="hybrid")
        )
    )


if __name__ == "__main__":
    main()


@@ -84,30 +84,22 @@ class InsertTextRequest(BaseModel):
     Attributes:
         text: The text content to be inserted into the RAG system
-        file_source: Source of the text (optional)
     """

     text: str = Field(
         min_length=1,
         description="The text to insert",
     )
-    file_source: str = Field(default=None, min_length=0, description="File Source")

     @field_validator("text", mode="after")
     @classmethod
-    def strip_text_after(cls, text: str) -> str:
+    def strip_after(cls, text: str) -> str:
         return text.strip()

-    @field_validator("file_source", mode="after")
-    @classmethod
-    def strip_source_after(cls, file_source: str) -> str:
-        return file_source.strip()
-
     class Config:
         json_schema_extra = {
             "example": {
-                "text": "This is a sample text to be inserted into the RAG system.",
-                "file_source": "Source of the text (optional)",
+                "text": "This is a sample text to be inserted into the RAG system."
             }
         }
@@ -117,37 +109,25 @@ class InsertTextsRequest(BaseModel):
     Attributes:
         texts: List of text contents to be inserted into the RAG system
-        file_sources: Sources of the texts (optional)
     """

     texts: list[str] = Field(
         min_length=1,
         description="The texts to insert",
     )
-    file_sources: list[str] = Field(
-        default=None, min_length=0, description="Sources of the texts"
-    )

     @field_validator("texts", mode="after")
     @classmethod
-    def strip_texts_after(cls, texts: list[str]) -> list[str]:
+    def strip_after(cls, texts: list[str]) -> list[str]:
         return [text.strip() for text in texts]

-    @field_validator("file_sources", mode="after")
-    @classmethod
-    def strip_sources_after(cls, file_sources: list[str]) -> list[str]:
-        return [file_source.strip() for file_source in file_sources]
-
     class Config:
         json_schema_extra = {
             "example": {
                 "texts": [
                     "This is the first text to be inserted.",
                     "This is the second text to be inserted.",
-                ],
-                "file_sources": [
-                    "First file source (optional)",
-                ],
+                ]
             }
         }
@@ -676,25 +656,16 @@ async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]):
         logger.error(traceback.format_exc())


-async def pipeline_index_texts(
-    rag: LightRAG, texts: List[str], file_sources: List[str] = None
-):
+async def pipeline_index_texts(rag: LightRAG, texts: List[str]):
     """Index a list of texts

     Args:
         rag: LightRAG instance
         texts: The texts to index
-        file_sources: Sources of the texts
     """
     if not texts:
         return
-    if file_sources is not None:
-        if len(file_sources) != 0 and len(file_sources) != len(texts):
-            [
-                file_sources.append("unknown_source")
-                for _ in range(len(file_sources), len(texts))
-            ]
-    await rag.apipeline_enqueue_documents(input=texts, file_paths=file_sources)
+    await rag.apipeline_enqueue_documents(texts)
     await rag.apipeline_process_enqueue_documents()
@@ -845,12 +816,7 @@ def create_document_routes(
             HTTPException: If an error occurs during text processing (500).
         """
         try:
-            background_tasks.add_task(
-                pipeline_index_texts,
-                rag,
-                [request.text],
-                file_sources=[request.file_source],
-            )
+            background_tasks.add_task(pipeline_index_texts, rag, [request.text])
             return InsertResponse(
                 status="success",
                 message="Text successfully received. Processing will continue in background.",
@@ -885,12 +851,7 @@ def create_document_routes(
             HTTPException: If an error occurs during text processing (500).
         """
         try:
-            background_tasks.add_task(
-                pipeline_index_texts,
-                rag,
-                request.texts,
-                file_sources=request.file_sources,
-            )
+            background_tasks.add_task(pipeline_index_texts, rag, request.texts)
             return InsertResponse(
                 status="success",
                 message="Text successfully received. Processing will continue in background.",


@@ -78,10 +78,6 @@ class QueryRequest(BaseModel):
         description="Number of complete conversation turns (user-assistant pairs) to consider in the response context.",
     )

-    ids: list[str] | None = Field(
-        default=None, description="List of ids to filter the results."
-    )
-
     user_prompt: Optional[str] = Field(
         default=None,
         description="User-provided prompt for the query. If provided, this will be used instead of the default value from prompt template.",


@@ -95,7 +95,7 @@ async def llama_index_complete_if_cache(
     prompt: str,
     system_prompt: Optional[str] = None,
     history_messages: List[dict] = [],
-    chat_kwargs={},
+    **kwargs,
 ) -> str:
     """Complete the prompt using LlamaIndex."""
     try:
@@ -122,9 +122,13 @@ async def llama_index_complete_if_cache(
         # Add current prompt
         formatted_messages.append(ChatMessage(role=MessageRole.USER, content=prompt))

-        response: ChatResponse = await model.achat(
-            messages=formatted_messages, **chat_kwargs
-        )
+        # Get LLM instance from kwargs
+        if "llm_instance" not in kwargs:
+            raise ValueError("llm_instance must be provided in kwargs")
+        llm = kwargs["llm_instance"]
+
+        # Get response
+        response: ChatResponse = await llm.achat(messages=formatted_messages)

         # In newer versions, the response is in message.content
         content = response.message.content