# Streamlit app for comparing code generation models (CodeParrot, OPT,
# InCoder): shows their pretraining dataset descriptions and builds
# text-generation pipelines for prompting.
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
from transformers import pipeline
import torch
import json


# Hub checkpoint used for each model name offered in the sidebar.
MODEL_CHECKPOINTS = {
    "CodeParrot": "lvwerra/codeparrot",
    "InCoder": "facebook/incoder-1B",
    "OPT": "facebook/opt-1.3b",
}
# Any name missing from the table falls back to the OPT checkpoint.
DEFAULT_CHECKPOINT = "facebook/opt-1.3b"


@st.cache(allow_output_mutation=True)
def load_tokenizer(model_ckpt):
    """Return the tokenizer for ``model_ckpt``; the result is cached by Streamlit."""
    return AutoTokenizer.from_pretrained(model_ckpt)


@st.cache(allow_output_mutation=True)
def load_model(model_ckpt):
    """Return the causal LM for ``model_ckpt``; the result is cached by Streamlit."""
    model = AutoModelForCausalLM.from_pretrained(model_ckpt, low_cpu_mem_usage=True)
    return model


@st.cache()
def load_examples():
    """Return the parsed contents of ``examples.json`` from the working directory."""
    with open("examples.json", "r", encoding="utf-8") as f:
        examples = json.load(f)
    return examples


st.set_page_config(page_icon=':laptop:', layout="wide")

st.sidebar.header("Models:")
models = ["CodeParrot", "OPT", "InCoder"]
selected_models = st.sidebar.multiselect(
    'Select code generation models to compare',
    models,
    default=["CodeParrot"],
)

st.sidebar.header("Tasks:")
tasks = ["Model architecture", "Model evaluation", "Pretraining dataset", "Prompting"]
selected_task = st.sidebar.selectbox("Select a task:", tasks)

architectures = {}
datasets = {}
pipelines = {}

if selected_task == "Pretraining dataset":
    st.title("Pretraining Datasets 📚")
    for model_name in selected_models:
        # Each model's dataset description lives in datasets/<lowercased name>.txt.
        with open(f"datasets/{model_name.lower()}.txt", "r", encoding="utf-8") as f:
            text = f.read()
        st.markdown(f"## {model_name}:")
        st.markdown(text)

elif selected_task == "Prompting":
    for model_name in selected_models:
        checkpoint = MODEL_CHECKPOINTS.get(model_name, DEFAULT_CHECKPOINT)
        tokenizer = load_tokenizer(checkpoint)
        model = load_model(checkpoint)
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
        # Key by the display name. The loop variable used to be overwritten by
        # the loaded model object, which then became the dictionary key.
        pipelines[model_name] = pipe