# Web-viewer page header captured together with the file (not part of the program):
# Spaces status: Runtime error
# File size: 2,453 Bytes
# Blame revisions: 9d2f477, 6acb880, e6db8ab
# (viewer line-number gutter 1-85 omitted)
# app.py
# Module-level imports and process-wide configuration for the API.
import os

# HF_HOME has to be exported BEFORE transformers / peft (and, through them,
# huggingface_hub) are imported: those libraries resolve their default cache
# location from the environment at import time, so assigning the variable
# after the imports leaves the default cache directory unchanged.
os.environ['HF_HOME'] = '/app/cache'

from typing import Optional  # noqa: E402

import torch  # noqa: E402
from fastapi import FastAPI, HTTPException  # noqa: E402
from peft import AutoPeftModelForCausalLM  # noqa: E402
from pydantic import BaseModel  # noqa: E402
from transformers import AutoTokenizer  # noqa: E402

# ASGI application object; the route decorators below register handlers on it.
app = FastAPI(title="Gemma Script Generator API")

# Hub access token taken from the environment; None when HF_TOKEN is unset.
hf_token = os.getenv('HF_TOKEN')
# Load model and tokenizer
MODEL_NAME = "Sidharthan/gemma2_scripter"
# Single cache location shared by the tokenizer and the model download, so
# neither of them falls back to a different default directory.
CACHE_DIR = '/app/cache'

# Loading happens once at import time. Any failure is printed and re-raised so
# the process does not start serving without a usable model.
try:
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        # NOTE(review): trust_remote_code executes Python shipped inside the
        # model repository; keep it only while that repository is trusted.
        trust_remote_code=True,
        # `token` replaces the deprecated `use_auth_token` keyword.
        token=hf_token,
        cache_dir=CACHE_DIR,
    )
    model = AutoPeftModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",  # Will use CPU if GPU not available
        trust_remote_code=True,
        # Same credentials as the tokenizer, so both downloads authenticate
        # the same way.
        token=hf_token,
        cache_dir=CACHE_DIR,
        # load_in_4bit=True
    )
except Exception as e:
    print(f"Error loading model: {str(e)}")
    raise
class GenerationRequest(BaseModel):
    """Request body accepted by the ``/generate`` endpoint.

    Only ``message`` is required; every other field has a default and is
    forwarded unchanged to ``model.generate`` by the handler.
    """

    # Prompt text; tokenized as-is by the handler.
    message: str
    # Forwarded as ``max_length`` to ``model.generate``.
    max_length: Optional[int] = 512
    # Forwarded as ``temperature``; sampling is always enabled by the handler.
    temperature: Optional[float] = 0.7
    # Forwarded as ``top_p``.
    top_p: Optional[float] = 0.95
    # Forwarded as ``top_k``.
    top_k: Optional[int] = 50
    # Forwarded as ``repetition_penalty``.
    repetition_penalty: Optional[float] = 1.2
class GenerationResponse(BaseModel):
    """Response body returned by the ``/generate`` endpoint."""

    # Decoded text of the first generated sequence, special tokens skipped.
    generated_text: str
@app.post("/generate", response_model=GenerationResponse)
async def generate_script(request: GenerationRequest):
    """Generate text for the prompt carried in ``request.message``.

    The prompt is tokenized, one sequence is sampled with the sampling
    parameters from the request, and that sequence is decoded with special
    tokens skipped.

    Args:
        request: Prompt plus sampling parameters (``max_length``,
            ``temperature``, ``top_p``, ``top_k``, ``repetition_penalty``).

    Returns:
        GenerationResponse whose ``generated_text`` is the decoded first
        output sequence.

    Raises:
        HTTPException: Status 500 with the error text as ``detail`` when
            tokenization, generation or decoding fails.
    """
    try:
        # Format prompt
        prompt = request.message

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt")

        # Put the inputs on the device that actually holds the model's
        # weights rather than assuming "CUDA is available" means the model
        # was placed there; the model is loaded with device_map="auto", so
        # its placement is decided at load time.
        target_device = next(model.parameters()).device
        inputs = {k: v.to(target_device) for k, v in inputs.items()}

        # Generate
        # NOTE(review): in transformers, `max_length` is understood to bound
        # prompt + generated tokens together - confirm that is intended.
        # NOTE(review): this call is synchronous inside an async handler, so
        # other requests wait while it runs.
        outputs = model.generate(
            **inputs,
            max_length=request.max_length,
            do_sample=True,
            temperature=request.temperature,
            top_p=request.top_p,
            top_k=request.top_k,
            repetition_penalty=request.repetition_penalty,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        # Decode output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return GenerationResponse(generated_text=generated_text)
    except Exception as e:
        # Keep the original exception attached as the cause so server-side
        # tracebacks show where the failure really happened.
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/health")
async def health_check():
    """Report service status; always answers with ``{"status": "healthy"}``."""
    status_payload = dict(status="healthy")
    return status_payload
if __name__ == "__main__":
    # Only needed when the file is executed directly as a script, so the
    # import stays inside the guard.
    import uvicorn

    # Serve the app on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)