From 0b87e4649f50c2d60690bac569dba990ce1519fa Mon Sep 17 00:00:00 2001
From: MRX760
Date: Tue, 3 Dec 2024 17:15:10 +0700
Subject: [PATCH] Add NVIDIA text-embedding API and an example of using the
 NVIDIA API for LLM and text embedding

---
 examples/lightrag_nvidia_demo.py | 159 +++++++++++++++++++++++++++++++
 lightrag/llm.py                  |  40 ++++++++
 2 files changed, 199 insertions(+)
 create mode 100644 examples/lightrag_nvidia_demo.py

diff --git a/examples/lightrag_nvidia_demo.py b/examples/lightrag_nvidia_demo.py
new file mode 100644
index 00000000..10d43c42
--- /dev/null
+++ b/examples/lightrag_nvidia_demo.py
@@ -0,0 +1,159 @@
+import os
+import asyncio
+from lightrag import LightRAG, QueryParam
+from lightrag.llm import openai_complete_if_cache, nvidia_openai_embedding, nvidia_openai_complete
+from lightrag.utils import EmbeddingFunc
+import numpy as np
+
+# needed for the custom llm_model_func below
+from lightrag.utils import locate_json_string_body_from_string
+
+WORKING_DIR = "./dickens"
+
+if not os.path.exists(WORKING_DIR):
+    os.mkdir(WORKING_DIR)
+
+# two ways to supply your API key (choose one)
+# NVIDIA_OPENAI_API_KEY = os.getenv("NVIDIA_OPENAI_API_KEY")
+NVIDIA_OPENAI_API_KEY = "nvapi-xxxx"  # your API key
+
+# Option 1: use the pre-defined function for the NVIDIA LLM API (OpenAI-compatible)
+# llm_model_func = nvidia_openai_complete
+
+# Option 2: define a custom llm_model_func for an LLM on the NVIDIA API, as in the other examples:
+async def llm_model_func(
+    prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
+) -> str:
+    result = await openai_complete_if_cache(
+        "nvidia/llama-3.1-nemotron-70b-instruct",
+        prompt,
+        system_prompt=system_prompt,
+        history_messages=history_messages,
+        api_key=NVIDIA_OPENAI_API_KEY,
+        base_url="https://integrate.api.nvidia.com/v1",
+        **kwargs,
+    )
+    if keyword_extraction:
+        return locate_json_string_body_from_string(result)
+    return result
+
+# custom embedding functions
+nvidia_embed_model = "nvidia/nv-embedqa-e5-v5"
+
+async def indexing_embedding_func(texts: list[str]) -> np.ndarray:
+    return await nvidia_openai_embedding(
+        texts,
+        model=nvidia_embed_model,  # maximum 512 tokens
+        # model="nvidia/llama-3.2-nv-embedqa-1b-v1",
+        api_key=NVIDIA_OPENAI_API_KEY,
+        base_url="https://integrate.api.nvidia.com/v1",
+        input_type="passage",
+        trunc="END",  # server-side truncation if the input exceeds the maximum token count
+        encode="float",
+    )
+
+async def query_embedding_func(texts: list[str]) -> np.ndarray:
+    return await nvidia_openai_embedding(
+        texts,
+        model=nvidia_embed_model,  # maximum 512 tokens
+        # model="nvidia/llama-3.2-nv-embedqa-1b-v1",
+        api_key=NVIDIA_OPENAI_API_KEY,
+        base_url="https://integrate.api.nvidia.com/v1",
+        input_type="query",
+        trunc="END",  # server-side truncation if the input exceeds the maximum token count
+        encode="float",
+    )
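+
+# NOTE: the nv-embedqa models are asymmetric retrieval embedders: documents are
+# embedded with input_type="passage" and queries with input_type="query",
+# which is why this demo defines two separate embedding functions.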
+
+# indexing and query embeddings share the same dimension
+async def get_embedding_dim():
+    test_text = ["This is a test sentence."]
+    embedding = await indexing_embedding_func(test_text)
+    embedding_dim = embedding.shape[1]
+    return embedding_dim
+
+
+# quick sanity check of the LLM and embedding functions
+async def test_funcs():
+    result = await llm_model_func("How are you?")
+    print("llm_model_func: ", result)
+
+    result = await indexing_embedding_func(["How are you?"])
+    print("embedding_func: ", result)
+
+
+# asyncio.run(test_funcs())
+
+
+async def main():
+    try:
+        embedding_dimension = await get_embedding_dim()
+        print(f"Detected embedding dimension: {embedding_dimension}")
+
+        # LightRAG instance used for indexing
+        rag = LightRAG(
+            working_dir=WORKING_DIR,
+            llm_model_func=llm_model_func,
+            # llm_model_name="meta/llama3-70b-instruct",  # uncomment if needed
+            embedding_func=EmbeddingFunc(
+                embedding_dim=embedding_dimension,
+                max_token_size=512,  # the model's maximum; LightRAG's tokenizer can still
+                # over-count tokens, so trunc="END" on the embedding call handles overlong
+                # input server-side (matching LightRAG's tokenizer to the NVIDIA model is future work)
+                func=indexing_embedding_func,
+            ),
+        )
+
+        # read and index the source document
+        with open("./book.txt", "r", encoding="utf-8") as f:
+            await rag.ainsert(f.read())
+
+        # re-create the RAG instance so embeddings use the "query" input type
+        rag = LightRAG(
+            working_dir=WORKING_DIR,
+            llm_model_func=llm_model_func,
+            # llm_model_name="meta/llama3-70b-instruct",  # uncomment if needed
+            embedding_func=EmbeddingFunc(
+                embedding_dim=embedding_dimension,
+                max_token_size=512,
+                func=query_embedding_func,
+            ),
+        )
+
+        # Perform naive search
+        print("==============Naive===============")
+        print(
+            await rag.aquery(
+                "What are the top themes in this story?", param=QueryParam(mode="naive")
+            )
+        )
+
+        # Perform local search
+        print("==============local===============")
+        print(
+            await rag.aquery(
+                "What are the top themes in this story?", param=QueryParam(mode="local")
+            )
+        )
+
+        # Perform global search
+        print("==============global===============")
+        print(
+            await rag.aquery(
+                "What are the top themes in this story?",
+                param=QueryParam(mode="global"),
+            )
+        )
+
+        # Perform hybrid search
+        print("==============hybrid===============")
+        print(
+            await rag.aquery(
+                "What are the top themes in this story?",
+                param=QueryParam(mode="hybrid"),
+            )
+        )
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/lightrag/llm.py b/lightrag/llm.py
index 7dc8b886..b4dcb044 100644
--- a/lightrag/llm.py
+++ b/lightrag/llm.py
@@ -502,6 +502,20 @@ async def gpt_4o_mini_complete(
         **kwargs,
     )
 
+async def nvidia_openai_complete(
+    prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
+) -> str:
+    result = await openai_complete_if_cache(
+        "nvidia/llama-3.1-nemotron-70b-instruct",  # context length 128k
+        prompt,
+        system_prompt=system_prompt,
+        history_messages=history_messages,
+        base_url="https://integrate.api.nvidia.com/v1",
+        **kwargs,
+    )
+    if keyword_extraction:  # TODO: use the JSON response-format API instead
+        return locate_json_string_body_from_string(result)
+    return result
 
 async def azure_openai_complete(
     prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
@@ -588,6 +602,32 @@ async def openai_embedding(
     return np.array([dp.embedding for dp in response.data])
 
 
+@wrap_embedding_func_with_attrs(embedding_dim=2048, max_token_size=512)
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=4, max=60),
+    retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)),
+)
+async def nvidia_openai_embedding(
+    texts: list[str],
+    model: str = "nvidia/llama-3.2-nv-embedqa-1b-v1",  # see https://build.nvidia.com/nim?filters=usecase%3Ausecase_text_to_embedding
+    base_url: str = "https://integrate.api.nvidia.com/v1",
+    api_key: str = None,
+    input_type: str = "passage",  # "query" for retrieval queries, "passage" for documents
+    trunc: str = "NONE",  # NONE, START, or END
+    encode: str = "float",  # float or base64
+) -> np.ndarray:
+    if api_key:
+        os.environ["OPENAI_API_KEY"] = api_key
+
+    openai_async_client = (
+        AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url)
+    )
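+    # input_type and truncate are NVIDIA-specific parameters that are not part
+    # of the OpenAI SDK's embeddings.create() signature, so they are passed via
+    # extra_body, which the SDK merges into the request payload.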
+    response = await openai_async_client.embeddings.create(
+        model=model, input=texts, encoding_format=encode,
+        extra_body={"input_type": input_type, "truncate": trunc},
+    )
+    return np.array([dp.embedding for dp in response.data])
+
 @wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8191)
 @retry(
     stop=stop_after_attempt(3),
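
For reference, a minimal standalone sketch of calling the two functions this patch adds, outside of the LightRAG pipeline. It assumes the patched package is importable and that the placeholder key "nvapi-xxxx" is replaced with a real NVIDIA API key; the model defaults are the ones hard-coded above.

import asyncio
from lightrag.llm import nvidia_openai_complete, nvidia_openai_embedding

async def demo():
    # completion: defaults to nvidia/llama-3.1-nemotron-70b-instruct and reads
    # the key from the OPENAI_API_KEY environment variable
    answer = await nvidia_openai_complete("What is retrieval-augmented generation?")
    print(answer)

    # embedding: the key can be passed explicitly; input_type="query" marks
    # retrieval-time input, trunc="END" truncates overlong input server-side
    vectors = await nvidia_openai_embedding(
        ["What are the top themes in this story?"],
        api_key="nvapi-xxxx",
        input_type="query",
        trunc="END",
    )
    print(vectors.shape)  # (1, 2048) with the default embedding model

asyncio.run(demo())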