import subprocess
import sys
import os


def install_package(package_name):
    """Install a package into the current interpreter's environment via pip."""
    subprocess.run([sys.executable, "-m", "pip", "install", package_name], check=True)


# Install torch on the fly if it is missing.
try:
    import torch
except ImportError:
    print("Torch is not installed. Installing torch...")
    install_package("torch")
    import torch

# Install transformers on the fly if it is missing.
try:
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        TextIteratorStreamer,
    )
except ImportError:
    print("Transformers is not installed. Installing transformers...")
    install_package("transformers")
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        TextIteratorStreamer,
    )

# Install flash-attn without building its CUDA kernels; keep the existing
# environment so pip stays on PATH.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

import gradio as gr
from threading import Thread

token = os.getenv("HF_TOKEN")
if not token:
    raise ValueError("The HF_TOKEN authentication token is not set.")

model = AutoModelForCausalLM.from_pretrained(
    "marua15/phi3-fine-tune",
    token=token,
    trust_remote_code=True,
)
tok = AutoTokenizer.from_pretrained("marua15/phi3-fine-tune", token=token)
terminators = [tok.eos_token_id]

# Place the model on the GPU when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

model = model.to(device)


def chat(message, history, temperature, do_sample, max_new_tokens):
    """Stream a chat completion for the Gradio ChatInterface."""
    # Rebuild the conversation in turn order: each history item is a
    # (user, assistant) pair, and the assistant reply may still be empty.
    conversation = []
    for user_msg, assistant_msg in history:
        conversation.append({"role": "user", "content": user_msg})
        if assistant_msg:
            conversation.append({"role": "assistant", "content": assistant_msg})
    conversation.append({"role": "user", "content": message})

    prompt = tok.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    model_inputs = tok([prompt], return_tensors="pt").to(device)

    # Run generation in a background thread and stream tokens back as they arrive.
    streamer = TextIteratorStreamer(tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = {
        "input_ids": model_inputs.input_ids,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "temperature": temperature,
        "eos_token_id": terminators[0],
    }
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
    yield partial_text


demo = gr.ChatInterface(
    fn=chat,
    examples=[["Generate a story about apples"]],
    additional_inputs_accordion=gr.Accordion(
        label="⚙️ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Slider(minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature"),
        gr.Checkbox(label="Sampling", value=True),
        gr.Slider(minimum=128, maximum=4096, step=1, value=512, label="Max new tokens"),
    ],
    stop_btn="Stop Generation",
    title="Chat With Phi3",
    description="Now Running [marua15/phi3-fine-tune](https://huggingface.co/marua15/phi3-fine-tune)",
)

if __name__ == "__main__":
    demo.launch()
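

# A minimal sketch for exercising the chat() generator directly, without the
# Gradio UI. The helper name `smoke_test` and the parameter values below are
# illustrative assumptions; call it manually, for example from a REPL after
# importing this module.
def smoke_test(prompt="Generate a story about apples"):
    final_text = ""
    # chat() yields the accumulated response as it streams; keep the last value.
    for final_text in chat(
        prompt, history=[], temperature=0.9, do_sample=True, max_new_tokens=128
    ):
        pass
    print(final_text)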