File size: 4,162 Bytes
c9e8e4a 3bce3fb a16fa71 41d27ac c9e8e4a fa5e188 7c0d726 99db140 fa5e188 c9e8e4a f4313df c9e8e4a 7c0d726 41d27ac 7c0d726 f7b6a4b 7c0d726 5740d40 c5fafcd 99db140 b6b5314 99db140 b8a51c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import json
import pandas as pd
import requests
from multiprocessing import Pool
from functools import partial
import streamlit as st
GITHUB_CODE = "https://huggingface.co/datasets/lvwerra/github-code"
INCODER_IMG = (
"https://huggingface.co/datasets/loubnabnl/repo-images/raw/main/incoder.png"
)
@st.cache()
def load_examples():
with open("utils/examples.json", "r") as f:
examples = json.load(f)
return examples
def generate_code(model_name, gen_prompt, max_new_tokens, temperature, seed):
url = (
f"https://hf.space/embed/loubnabnl/{model_name.lower()}-subspace/+/api/predict/"
)
r = requests.post(
url=url, json={"data": [gen_prompt, max_new_tokens, temperature, seed]}
)
generated_text = r.json()["data"][0]
return generated_text
st.set_page_config(page_icon=":laptop:", layout="wide")
st.sidebar.header("Models")
models = ["CodeParrot", "InCoder"]
selected_models = st.sidebar.multiselect(
"Select code generation models to compare", models, default=["CodeParrot"]
)
st.sidebar.header("Tasks")
tasks = [
" ",
"Pretraining datasets",
"Model architecture",
"Model evaluation",
"Code generation",
]
selected_task = st.sidebar.selectbox("Select a task", tasks)
if selected_task == " ":
st.title("Code Generation Models")
with open("utils/intro.txt", "r") as f:
intro = f.read()
st.markdown(intro)
elif selected_task == "Pretraining datasets":
st.title("Pretraining datasets π")
st.markdown(
f"Preview of some code files from Github repositories in [Github-code dataset]({GITHUB_CODE}):"
)
df = pd.read_csv("utils/data_preview.csv")
st.dataframe(df)
for model in selected_models:
with open(f"datasets/{model.lower()}.txt", "r") as f:
text = f.read()
st.markdown(f"### {model}")
st.markdown(text)
elif selected_task == "Model architecture":
st.title("Model architecture")
for model in selected_models:
with open(f"architectures/{model.lower()}.txt", "r") as f:
text = f.read()
st.markdown(f"## {model}")
st.markdown(text)
if model == "InCoder":
st.image(INCODER_IMG, caption="Figure 1: InCoder training", width=700)
elif selected_task == "Model evaluation":
st.title("Code models evaluation π")
with open("evaluation/intro.txt", "r") as f:
intro = f.read()
st.markdown(intro)
elif selected_task == "Code generation":
st.title("Code generation π»")
st.sidebar.header("Examples")
examples = load_examples()
example_names = [example["name"] for example in examples]
name2id = dict([(name, i) for i, name in enumerate(example_names)])
selected_example = st.sidebar.selectbox(
"Select one of the following examples or implement yours", example_names
)
example_text = examples[name2id[selected_example]]["value"]
default_length = examples[name2id[selected_example]]["length"]
st.sidebar.header("Generation settings")
temperature = st.sidebar.slider(
"Temperature:", value=0.2, min_value=0.0, step=0.1, max_value=2.0
)
max_new_tokens = st.sidebar.slider(
"Number of tokens to generate:",
value=default_length,
min_value=8,
step=8,
max_value=256,
)
seed = st.sidebar.slider(
"Random seed:", value=42, min_value=0, step=1, max_value=1000
)
gen_prompt = st.text_area(
"Generate code with prompt:",
value=example_text,
height=220,
).strip()
if st.button("Generate code!"):
with st.spinner("Generating code..."):
# Create a multiprocessing Pool
pool = Pool()
generate_parallel = partial(
generate_code,
gen_prompt=gen_prompt,
max_new_tokens=max_new_tokens,
temperature=temperature,
seed=seed,
)
output = pool.map(generate_parallel, selected_models)
for i in range(len(output)):
st.markdown(f"**{selected_models[i]}**")
st.code(output[i])
|