"""Text-generation endpoint handler built on a transformers pipeline."""

from typing import Any, Dict

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

# Generation settings shared by the handler's constructor and call path.
_MAX_TOKENS = 1000
_TEMPERATURE = 0.7
_TOP_P = 0.9


def _preferred_dtype() -> torch.dtype:
    """Return bfloat16 on GPUs of compute capability >= 8, else float16."""
    # Guard the capability query: it raises when no CUDA device is present,
    # which would otherwise make this module fail at import time.
    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
        return torch.bfloat16
    return torch.float16


dtype = _preferred_dtype()


class EndpointHandler:
    """Load a causal LM from ``path`` and serve text-generation requests."""

    def __init__(self, path: str = "") -> None:
        """Load tokenizer and model from ``path`` and build the pipeline.

        Args:
            path: Location passed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        # SECURITY NOTE: trust_remote_code=True executes code shipped with the
        # checkpoint; only point ``path`` at sources you trust.
        tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            path,
            return_dict=True,
            device_map="auto",
            load_in_8bit=True,
            torch_dtype=dtype,
            trust_remote_code=True,
        )

        # This is the model's own generation config object, so the
        # assignments below also change the defaults the model uses.
        self.generation_config = model.generation_config
        self.generation_config.max_new_tokens = _MAX_TOKENS
        self.generation_config.temperature = _TEMPERATURE
        self.generation_config.num_return_sequences = 1
        # The EOS token doubles as the padding token.
        self.generation_config.pad_token_id = tokenizer.eos_token_id
        self.generation_config.eos_token_id = tokenizer.eos_token_id

        self.pipeline = transformers.pipeline(
            "text-generation", model=model, tokenizer=tokenizer
        )

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Generate text for one request.

        Args:
            data: Request payload. The prompt is taken from (and removed
                from) the ``"inputs"`` key; when that key is absent the
                whole payload is passed to the pipeline as the prompt.

        Returns:
            A dict whose ``"generated_text"`` value is the pipeline's raw
            return value.
        """
        prompt = data.pop("inputs", data)
        # NOTE(review): temperature/top_p presumably only take effect when
        # sampling is enabled (do_sample=True); nothing here enables it, so
        # confirm the checkpoint's generation config does.
        # NOTE(review): max_length is passed while max_new_tokens is also set
        # on the model's generation config; confirm which limit is intended.
        result = self.pipeline(
            prompt,
            max_length=_MAX_TOKENS,
            temperature=_TEMPERATURE,
            top_p=_TOP_P,
            num_return_sequences=1,
            pad_token_id=self.generation_config.pad_token_id,
            eos_token_id=self.generation_config.eos_token_id,
            return_full_text=True,
        )
        return {"generated_text": result}