diff --git a/lightrag/llm.py b/lightrag/llm.py
index eaaa2b75..bb0d6063 100644
--- a/lightrag/llm.py
+++ b/lightrag/llm.py
@@ -282,10 +282,11 @@ async def hf_model_if_cache(
     input_ids = hf_tokenizer(
         input_prompt, return_tensors="pt", padding=True, truncation=True
     ).to("cuda")
+    inputs = {k: v.to(hf_model.device) for k, v in input_ids.items()}
     output = hf_model.generate(
-        **input_ids, max_new_tokens=200, num_return_sequences=1, early_stopping=True
+        **input_ids, max_new_tokens=512, num_return_sequences=1, early_stopping=True
     )
-    response_text = hf_tokenizer.decode(output[0], skip_special_tokens=True)
+    response_text = hf_tokenizer.decode(output[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
     if hashing_kv is not None:
         await hashing_kv.upsert({args_hash: {"return": response_text, "model": model}})
     return response_text
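
Note: the key fix here is the decode call. For decoder-only Hugging Face models,
generate() returns the prompt tokens followed by the generated tokens, so decoding
output[0] as-is echoes the prompt back into the response; slicing off the prompt
length leaves only the completion. A minimal standalone sketch of that pattern
(the "gpt2" model and the sample prompt are illustrative assumptions, not part of
this patch):

    # Sketch of the decode-only-new-tokens pattern used in the patch above.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed model for illustration
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    inputs = tokenizer("Hello, world", return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=20)

    # output[0] = prompt tokens + generated tokens; drop the prompt prefix
    # so the decoded string contains only the new completion.
    prompt_len = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
    print(completion)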