from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from typing import Dict
import os


def get_model():
    model_id = "mistralai/Mistral-7B-Instruct-v0.2"

    # Route newly created tensors to the GPU when one is present.
    if torch.cuda.is_available():
        torch.set_default_device('cuda')

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Mistral's tokenizer ships without a pad token; reuse EOS so that
    # padding=True and pad_token_id in generate() work as expected.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )

    if torch.cuda.is_available():
        model = model.cuda()

    return model, tokenizer


# Load once at module import so repeated inference calls reuse the model.
model, tokenizer = get_model()


def generate(text: str, params: Dict) -> Dict:
    try:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {device}")

        # Tokenize the prompt and move every input tensor onto the target device.
        inputs = tokenizer(text, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        print(f"Input device: {inputs['input_ids'].device}")
        print(f"Model device: {next(model.parameters()).device}")

        # torch.cuda.device() raises on CPU-only machines; inference_mode()
        # works on either device and skips gradient tracking during generation.
        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                max_new_tokens=params.get('max_new_tokens', 500),
                do_sample=True,  # temperature/top_p/top_k are ignored without sampling
                temperature=params.get('temperature', 0.7),
                top_p=params.get('top_p', 0.95),
                top_k=params.get('top_k', 50),
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return {"generated_text": response}

    except Exception as e:
        # Surface device state before re-raising to make CUDA issues easier to debug.
        print(f"Error in generation: {str(e)}")
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"Current CUDA device: {torch.cuda.current_device()}")
            print(f"Device count: {torch.cuda.device_count()}")
        raise


def inference(inputs: Dict) -> Dict:
    # Handler entry point: expects {"inputs": <prompt>, "parameters": {...}}.
    prompt = inputs.get("inputs", "")
    params = inputs.get("parameters", {})
    return generate(prompt, params)
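

# Minimal local usage sketch: the prompt string and parameter values below are
# illustrative assumptions, not part of the original handler contract. Mistral
# Instruct models expect the [INST] ... [/INST] wrapping shown here.
if __name__ == "__main__":
    result = inference({
        "inputs": "[INST] Explain what a tokenizer does in one sentence. [/INST]",
        "parameters": {"max_new_tokens": 64, "temperature": 0.3},
    })
    print(result["generated_text"])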