File size: 3,799 Bytes
c9e8e4a 3bce3fb a16fa71 41d27ac c9e8e4a fa5e188 7c0d726 0d5adbc fa5e188 c9e8e4a f4313df c9e8e4a 7c0d726 41d27ac 7c0d726 f7b6a4b 7c0d726 58551fa c5fafcd 0d5adbc 713d533 b3036ef 0d5adbc 58551fa 0d5adbc ad72f9a 0d5adbc 58551fa b3036ef 0d5adbc 58551fa 0d5adbc 58551fa 0d5adbc ad72f9a 0d5adbc ad72f9a 0d5adbc 58551fa b3036ef 99db140 0d5adbc 58551fa 0d5adbc 58551fa 0d5adbc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import json
import pandas as pd
import requests
from multiprocessing import Pool
from functools import partial
import streamlit as st
# Dataset page linked from the "Pretraining datasets" section.
GITHUB_CODE = "https://huggingface.co/datasets/lvwerra/github-code"

# Figure displayed in the architecture section when InCoder is selected.
INCODER_IMG = "https://huggingface.co/datasets/loubnabnl/repo-images/raw/main/incoder.png"

# Models offered in the selectors; the lower-cased name is also used to build
# per-model file paths and the prediction API URL.
MODELS = [
    "CodeParrot",
    "InCoder",
]
@st.cache()
def load_examples():
    """Load the prompt examples offered in the app.

    Wrapped in ``st.cache()`` so Streamlit can reuse the parsed result
    instead of re-reading the file each time the script runs.

    Returns:
        The parsed content of ``utils/examples.json``. The code in this file
        indexes it by position and reads the ``"name"``, ``"value"`` and
        ``"length"`` keys of each entry.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open("utils/examples.json", "r", encoding="utf-8") as f:
        return json.load(f)
def generate_code(model_name, gen_prompt, max_new_tokens, temperature, seed, timeout=120):
    """Request a code completion from the hosted prediction endpoint of a model.

    Args:
        model_name: Display name of the model; its lower-cased form selects
            the ``<name>-subspace`` endpoint.
        gen_prompt: Prompt text to complete.
        max_new_tokens: Number of tokens to generate.
        temperature: Sampling temperature forwarded to the endpoint.
        seed: Random seed forwarded to the endpoint.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely (the previous behavior).

    Returns:
        The first element of the ``"data"`` list in the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = (
        f"https://hf.space/embed/loubnabnl/{model_name.lower()}-subspace/+/api/predict/"
    )
    response = requests.post(
        url=url,
        json={"data": [gen_prompt, max_new_tokens, temperature, seed]},
        timeout=timeout,
    )
    # Fail with the real HTTP error instead of an opaque KeyError/JSON error
    # when the endpoint is down or rejects the request.
    response.raise_for_status()
    return response.json()["data"][0]
#st.set_page_config(page_icon=":laptop:", layout="wide")

# Introduction
# The title emoji was stored as mis-decoded bytes; it is the hugging-face emoji.
st.title("Code generation with 🤗")
# Read explicitly as UTF-8 so the markdown renders the same regardless of the
# platform's default locale encoding.
with open("utils/intro.txt", "r", encoding="utf-8") as f:
    intro = f.read()
st.markdown(intro)
# Pretraining datasets
# NOTE(review): the trailing "π" in the title looks like a mis-encoded emoji;
# the intended character cannot be recovered from this file - restore it from
# the original source.
st.title("1 - Pretraining datasets π")
st.markdown(
    f"Preview of some code files from Github repositories in [Github-code dataset]({GITHUB_CODE}):"
)
df = pd.read_csv("utils/data_preview.csv")
st.dataframe(df)

st.header("Model")
# Only the middle of the three columns is used, for the model selector.
col1, col2, col3 = st.columns(3)
with col2:
    selected_model = st.selectbox(
        "Select a code generation model", MODELS, key=1
    )
# Each model has its own description file, named after the lower-cased model.
# Read explicitly as UTF-8 rather than relying on the locale default.
with open(f"datasets/{selected_model.lower()}.txt", "r", encoding="utf-8") as f:
    text = f.read()
st.markdown(text)
# Model architecture
st.title("2 - Model architecture")
st.markdown("Most code generation models use GPT style architectures trained on code. Some use encoder-decoder architectures such as AlphaCode.")

st.header("Model")
selected_model = st.selectbox(
    "Select a code generation model", MODELS, key=2
)
# Each model has its own architecture description, named after the lower-cased
# model. Read explicitly as UTF-8 rather than relying on the locale default.
with open(f"architectures/{selected_model.lower()}.txt", "r", encoding="utf-8") as f:
    text = f.read()
st.markdown(text)
# InCoder additionally gets an illustration of its training.
if selected_model == "InCoder":
    st.image(INCODER_IMG, caption="Figure 1: InCoder training", width=700)
# Model evaluation
# NOTE(review): the trailing "π" in the title looks like a mis-encoded emoji;
# the intended character cannot be recovered from this file - restore it from
# the original source.
st.title("3 - Code models evaluation π")
# Read explicitly as UTF-8 so the markdown renders the same regardless of the
# platform's default locale encoding.
with open("evaluation/intro.txt", "r", encoding="utf-8") as f:
    intro = f.read()
st.markdown(intro)
# Code generation
# The title emoji was stored as mis-decoded bytes; it is the laptop emoji.
st.title("4 - Code generation 💻")
st.header("Models")
selected_models = st.multiselect(
    "Select code generation models to compare", MODELS, default=["CodeParrot"], key=3
)

st.header("Examples")
examples = load_examples()
example_names = [example["name"] for example in examples]
# Map each example name to its position so the selection can be resolved.
name2id = {name: i for i, name in enumerate(example_names)}
selected_example = st.selectbox(
    "Select one of the following examples or implement yours", example_names
)
example = examples[name2id[selected_example]]
example_text = example["value"]
default_length = example["length"]

st.header("Generation settings")
temperature = st.slider(
    "Temperature:", value=0.2, min_value=0.0, step=0.1, max_value=2.0
)
# The chosen example supplies the initial number of tokens to generate.
max_new_tokens = st.slider(
    "Number of tokens to generate:",
    value=default_length,
    min_value=8,
    step=8,
    max_value=256,
)
seed = st.slider(
    "Random seed:", value=42, min_value=0, step=1, max_value=1000
)
# The prompt is pre-filled with the example text and can be edited freely.
gen_prompt = st.text_area(
    "Generate code with prompt:",
    value=example_text,
    height=220,
).strip()

if st.button("Generate code!"):
    with st.spinner("Generating code..."):
        # Fix every setting except the model name, which the pool maps over.
        generate_parallel = partial(
            generate_code,
            gen_prompt=gen_prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            seed=seed,
        )
        # Query all selected models in parallel. The context manager shuts
        # the worker processes down once the results are in, so repeated
        # clicks do not leave idle pools behind.
        with Pool() as pool:
            output = pool.map(generate_parallel, selected_models)
        # pool.map keeps input order, so results line up with the models.
        for model_name, generated in zip(selected_models, output):
            st.markdown(f"**{model_name}**")
            st.code(generated)
|