File size: 3,689 Bytes
c9e8e4a a16fa71 c9e8e4a 3bce3fb a16fa71 c9e8e4a 5740d40 c5fafcd dc737c0 7982bc6 a16fa71 5740d40 a16fa71 7982bc6 d297c51 5740d40 c9e8e4a 7cf1a13 33c3beb e22d1d3 7982bc6 a16fa71 9be3f4c f25abd8 3e9df94 1cb474a 3bce3fb c9e8e4a 5740d40 a16fa71 f25abd8 5b9eb09 f25abd8 a19ffbb f25abd8 5740d40 7cf1a13 90b0361 8164071 18d2e11 22fef42 a16fa71 7cf1a13 a16fa71 5740d40 7cf1a13 3283b93 a16fa71 7cf1a13 a16fa71 5740d40 a16fa71 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
import torch
import json
import pandas as pd
import requests
@st.cache(allow_output_mutation=True)
def load_tokenizer(model_ckpt):
    """Load the pretrained tokenizer identified by ``model_ckpt``.

    The call is wrapped in ``st.cache`` (with ``allow_output_mutation=True``)
    so the tokenizer is not re-created on every Streamlit rerun.

    Args:
        model_ckpt: Checkpoint identifier forwarded unchanged to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns for the checkpoint.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    return tokenizer
@st.cache(allow_output_mutation=True)
def load_model(model_ckpt):
    """Load the causal language model identified by ``model_ckpt``.

    The call is wrapped in ``st.cache`` (with ``allow_output_mutation=True``)
    so the model is not re-created on every Streamlit rerun.

    Args:
        model_ckpt: Checkpoint identifier forwarded unchanged to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        The model object produced by ``AutoModelForCausalLM.from_pretrained``,
        loaded with ``low_cpu_mem_usage=True``.
    """
    return AutoModelForCausalLM.from_pretrained(
        model_ckpt,
        low_cpu_mem_usage=True,
    )
@st.cache()
def load_examples():
    """Return the parsed contents of ``examples.json``.

    The file is read relative to the current working directory and the
    result is cached by ``st.cache`` so it is not re-parsed on every rerun.

    Returns:
        The object decoded by ``json.load``. The caller in this file indexes
        it as a sequence of dicts with "name", "value" and "length" keys.

    Raises:
        OSError: If ``examples.json`` cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, which may not be UTF-8 and would corrupt or reject
    # non-ASCII characters in the example prompts.
    with open("examples.json", "r", encoding="utf-8") as f:
        return json.load(f)
# Page-wide settings: laptop icon and the wide layout.
st.set_page_config(page_icon=":laptop:", layout="wide")

# Sidebar, section 1: which models to compare (CodeParrot preselected).
st.sidebar.header("Models")
models = ["CodeParrot", "InCoder"]
selected_models = st.sidebar.multiselect(
    label="Select code generation models to compare",
    options=models,
    default=["CodeParrot"],
)

# Sidebar, section 2: which page to show. The first entry is a single space,
# which the dispatch code below treats as the introduction page.
st.sidebar.header("Tasks")
tasks = [
    " ",
    "Pretraining datasets",
    "Model architecture",
    "Model evaluation",
    "Code generation",
]
selected_task = st.sidebar.selectbox(label="Select a task", options=tasks)
# (connect, read) timeouts in seconds for the remote generation endpoints.
# Without a timeout, requests.post can block forever on an unresponsive
# endpoint. The read timeout is deliberately generous because generation can
# be slow; tune as needed.
_REQUEST_TIMEOUT = (10, 300)


def _read_text(path):
    """Return the full contents of the text file at ``path``, decoded as UTF-8."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def _request_generation(model, prompt, max_new_tokens, temperature, seed):
    """Ask the remote endpoint of ``model`` to complete ``prompt``.

    Args:
        model: Display name of the model; its lowercase form selects the URL.
        prompt: Text to complete.
        max_new_tokens: Number of tokens to generate.
        temperature: Sampling temperature.
        seed: Random seed.

    Returns:
        The first element of the ``"data"`` list in the JSON response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError, IndexError, TypeError: If the JSON does not have the
            expected ``{"data": [...]}`` shape.
    """
    url = f'https://hf.space/embed/loubnabnl/{model.lower()}-subspace/+/api/predict/'
    response = requests.post(
        url=url,
        json={"data": [prompt, max_new_tokens, temperature, seed]},
        timeout=_REQUEST_TIMEOUT,
    )
    # Surface HTTP errors explicitly rather than failing later while parsing
    # an error page as if it were a prediction.
    response.raise_for_status()
    return response.json()["data"][0]


# NOTE(review): the glyphs at the end of several titles below look like
# mis-decoded emoji. They are reproduced unchanged here; confirm against the
# original file before replacing them.
if selected_task == " ":
    # Landing page: title plus the introduction text.
    st.title("Code Generation Models")
    intro = _read_text("intro.txt")
    st.markdown(intro)
elif selected_task == "Pretraining datasets":
    st.title("Pretraining datasets π")
    st.markdown("Preview of some code files from Github repositories")
    df = pd.read_csv("data_preview.csv")
    st.dataframe(df)
    # One description per selected model, read from datasets/<model>.txt.
    for model in selected_models:
        text = _read_text(f"datasets/{model.lower()}.txt")
        st.markdown(f"### {model}")
        st.markdown(text)
elif selected_task == "Model architecture":
    st.title("Model architecture π¨")
    # One description per selected model, read from architectures/<model>.txt.
    for model in selected_models:
        text = _read_text(f"architectures/{model.lower()}.txt")
        st.markdown(f"## {model}")
        st.markdown(text)
        if model == "InCoder":
            st.image(
                "https://huggingface.co/datasets/loubnabnl/repo-images/raw/main/incoder.png",
                caption="Figure 1: InCoder training",
                width=700,
            )
elif selected_task == "Model evaluation":
    st.title("Code models evaluation π")
    intro = _read_text("evaluation/intro.txt")
    st.markdown(intro)
elif selected_task == "Code generation":
    st.title("Code generation π»")

    # Sidebar: choose one of the bundled example prompts.
    st.sidebar.header("Examples")
    examples = load_examples()
    example_names = [example["name"] for example in examples]
    name2id = {name: i for i, name in enumerate(example_names)}
    selected_example = st.sidebar.selectbox(
        "Select one of the following examples", example_names
    )
    chosen_example = examples[name2id[selected_example]]
    example_text = chosen_example["value"]
    default_length = chosen_example["length"]

    # Sidebar: generation parameters sent with every request.
    st.sidebar.header("Generation settings")
    temperature = st.sidebar.slider(
        "Temperature:", value=0.2, min_value=0.0, step=0.1, max_value=2.0
    )
    max_new_tokens = st.sidebar.slider(
        "Number of tokens to generate:",
        value=default_length,
        min_value=8,
        step=8,
        max_value=256,
    )
    seed = st.sidebar.slider(
        "Random seed:", value=42, min_value=0, step=1, max_value=1000
    )

    gen_prompt = st.text_area(
        "Generate code with prompt:", value=example_text, height=220,
    ).strip()

    if st.button("Generate code!"):
        if not selected_models:
            # The multiselect can be emptied; say so instead of silently
            # showing nothing.
            st.warning("Select at least one model in the sidebar to generate code.")
        with st.spinner("Generating code..."):
            for model in selected_models:
                try:
                    generated_text = _request_generation(
                        model, gen_prompt, max_new_tokens, temperature, seed
                    )
                except (
                    requests.RequestException,
                    ValueError,
                    KeyError,
                    IndexError,
                    TypeError,
                ) as err:
                    # Report the failure for this model and carry on, so one
                    # unavailable endpoint does not take down the whole page.
                    st.error(f"{model}: code generation failed ({err})")
                    continue
                st.markdown(f"{model}")
                st.code(generated_text)
|