From 226f6f3d87febd3041017d3d0299a00138ce8832 Mon Sep 17 00:00:00 2001
From: tackhwa
Date: Sat, 26 Oct 2024 02:20:23 +0800
Subject: [PATCH] fix hf output bug

---
 lightrag/llm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lightrag/llm.py b/lightrag/llm.py
index 4dcf535c..692937fb 100644
--- a/lightrag/llm.py
+++ b/lightrag/llm.py
@@ -266,10 +266,11 @@ async def hf_model_if_cache(
     input_ids = hf_tokenizer(
         input_prompt, return_tensors="pt", padding=True, truncation=True
     ).to("cuda")
+    inputs = {k: v.to(hf_model.device) for k, v in input_ids.items()}
     output = hf_model.generate(
         **input_ids, max_new_tokens=200, num_return_sequences=1, early_stopping=True
     )
-    response_text = hf_tokenizer.decode(output[0], skip_special_tokens=True)
+    response_text = hf_tokenizer.decode(output[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
     if hashing_kv is not None:
         await hashing_kv.upsert({args_hash: {"return": response_text, "model": model}})
     return response_text
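
For context (not part of the patch): a minimal, self-contained sketch of the behavior the fix addresses. Hugging Face `generate()` for causal LMs returns the prompt token ids followed by the newly generated ids, so decoding `output[0]` without slicing echoes the prompt back in the response; slicing off `len(inputs["input_ids"][0])` tokens keeps only the continuation. The model name and prompt below are placeholders, not values from the LightRAG code.

```python
# Sketch only: demonstrates why the prompt is sliced off before decoding.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "sshleifer/tiny-gpt2"  # placeholder; any causal LM works
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

prompt = "Hello, world"  # placeholder prompt
inputs = tokenizer(prompt, return_tensors="pt")
# Move tensors to the model's device, mirroring the patched code.
inputs = {k: v.to(model.device) for k, v in inputs.items()}

with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=20)

# output[0] = prompt token ids + newly generated token ids
full_text = tokenizer.decode(output[0], skip_special_tokens=True)
new_text = tokenizer.decode(
    output[0][len(inputs["input_ids"][0]):], skip_special_tokens=True
)
print(full_text)  # includes the prompt
print(new_text)   # only the model's continuation
```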