This commit is contained in:
zrguo
2025-03-03 18:33:42 +08:00
parent 887388c317
commit 1611400854
41 changed files with 1390 additions and 1301 deletions

View File

@@ -1,5 +1,9 @@
import os
import asyncio
import nest_asyncio
nest_asyncio.apply()
from lightrag import LightRAG, QueryParam
from lightrag.llm import (
openai_complete_if_cache,
@@ -7,10 +11,12 @@ from lightrag.llm import (
)
from lightrag.utils import EmbeddingFunc
import numpy as np
from lightrag.kg.shared_storage import initialize_pipeline_status
# for custom llm_model_func
from lightrag.utils import locate_json_string_body_from_string
WORKING_DIR = "./dickens"
if not os.path.exists(WORKING_DIR):
@@ -91,42 +97,37 @@ async def test_funcs():
# asyncio.run(test_funcs())
async def initialize_rag():
embedding_dimension = await get_embedding_dim()
print(f"Detected embedding dimension: {embedding_dimension}")
# lightRAG class during indexing
rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=llm_model_func,
# llm_model_name="meta/llama3-70b-instruct", #un comment if
embedding_func=EmbeddingFunc(
embedding_dim=embedding_dimension,
max_token_size=512, # maximum token size, somehow it's still exceed maximum number of token
# so truncate (trunc) parameter on embedding_func will handle it and try to examine the tokenizer used in LightRAG
# so you can adjust to be able to fit the NVIDIA model (future work)
func=indexing_embedding_func,
),
)
await rag.initialize_storages()
await initialize_pipeline_status()
return rag
async def main():
try:
embedding_dimension = await get_embedding_dim()
print(f"Detected embedding dimension: {embedding_dimension}")
# lightRAG class during indexing
rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=llm_model_func,
# llm_model_name="meta/llama3-70b-instruct", #un comment if
embedding_func=EmbeddingFunc(
embedding_dim=embedding_dimension,
max_token_size=512, # maximum token size, somehow it's still exceed maximum number of token
# so truncate (trunc) parameter on embedding_func will handle it and try to examine the tokenizer used in LightRAG
# so you can adjust to be able to fit the NVIDIA model (future work)
func=indexing_embedding_func,
),
)
# Initialize RAG instance
rag = asyncio.run(initialize_rag())
# reading file
with open("./book.txt", "r", encoding="utf-8") as f:
await rag.ainsert(f.read())
# redefine rag to change embedding into query type
rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=llm_model_func,
# llm_model_name="meta/llama3-70b-instruct", #un comment if
embedding_func=EmbeddingFunc(
embedding_dim=embedding_dimension,
max_token_size=512,
func=query_embedding_func,
),
)
# Perform naive search
print("==============Naive===============")
print(