import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gradio as gr

# Model identifiers: base Falcon-7B checkpoint and the fine-tuned LoRA adapter
model_name = "ybelkada/falcon-7b-sharded-bf16"
fine_tuned_model = "mounseflit/falcon-7b-marrakech"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load base model (CPU-only, no quantization)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",          # Automatic device placement (resolves to CPU in this setup)
    offload_folder="offload",   # Offload large parts of the model to disk to save memory
    offload_state_dict=True     # Offload the state dict while loading to reduce peak memory usage
)

# Load the fine-tuned LoRA adapter on top of the base model
model = PeftModel.from_pretrained(base_model, fine_tuned_model)

# Ensure the model is in evaluation mode
model.eval()

# Generate a completion for the given prompt
def generate_text(prompt):
    # Truncate the prompt to 50 tokens to keep CPU inference fast
    inputs = tokenizer(prompt, return_tensors="pt", max_length=50, truncation=True).to("cpu")
    with torch.no_grad():
        # Cap the total sequence length (prompt + completion) at 100 tokens
        outputs = model.generate(**inputs, max_length=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Create Gradio interface
iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Falcon 7B Lite (CPU)")

# Launch the app
iface.launch()