Loubna ben allal committed
Commit c9e8e4a
1 Parent(s): d490108
add files
app.py
ADDED
@@ -0,0 +1,82 @@
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
from transformers import pipeline
import torch
import json


# Cache tokenizer and model loading so each checkpoint is only downloaded once.
@st.cache(allow_output_mutation=True)
def load_tokenizer(model_ckpt):
    return AutoTokenizer.from_pretrained(model_ckpt)

@st.cache(allow_output_mutation=True)
def load_model(model_ckpt):
    model = AutoModelForCausalLM.from_pretrained(model_ckpt, low_cpu_mem_usage=True)
    return model

@st.cache()
def load_examples():
    with open("examples.json", "r") as f:
        examples = json.load(f)
    return examples

st.set_page_config(page_icon=':parrot:', layout="wide")

tokenizer1 = load_tokenizer("lvwerra/codeparrot")
model1 = load_model("lvwerra/codeparrot")

tokenizer2 = load_tokenizer("facebook/opt-1.3b")
model2 = load_model("facebook/opt-1.3b")

tokenizer3 = load_tokenizer("facebook/incoder-1B")
model3 = load_model("facebook/incoder-1B")

st.sidebar.header("Models:")
models = ["CodeParrot", "OPT", "InCoder"]
selected_models = st.multiselect('Select code generation models to compare',
                                 models,
                                 default=["CodeParrot"])
st.sidebar.header("Tasks:")
tasks = ["Model architecture", "Model evaluation", "Pretraining dataset", "Prompting"]
# st.selectbox has no `default` argument; preselect "Model architecture" via `index`.
selected_task = st.sidebar.selectbox("Select a task:", tasks, index=0)

st.title("Code Generation Models👩‍💻")

architectures = {}
datasets = {}
pipelines = {}
if selected_task == "Model architecture":
    st.markdown("## Model architectures")
    for model in selected_models:
        with open(f"datasets/{model.lower()}.txt", "r") as f:
            text = f.read()
        # architectures[model] = text
        st.markdown(f"### {model}:")
        st.markdown(text)

elif selected_task == "Pretraining dataset":
    st.markdown("## Pretraining Datasets")
    for model in selected_models:
        with open(f"datasets/{model.lower()}.txt", "r") as f:
            text = f.read()
        # datasets[model] = text
        st.markdown(f"### {model}:")
        st.markdown(text)

elif selected_task == "Prompting":
    # Keep the model name in `model` and the loaded weights in `generation_model`,
    # so that `pipelines` is keyed by model name rather than by the model object.
    for model in selected_models:
        if model == "CodeParrot":
            tokenizer = load_tokenizer("lvwerra/codeparrot")
            generation_model = load_model("lvwerra/codeparrot")
            pipe = pipeline("text-generation", model=generation_model, tokenizer=tokenizer)
            pipelines[model] = pipe
        elif model == "InCoder":
            tokenizer = load_tokenizer("facebook/incoder-1B")
            generation_model = load_model("facebook/incoder-1B")
            pipe = pipeline("text-generation", model=generation_model, tokenizer=tokenizer)
            pipelines[model] = pipe
        else:
            tokenizer = load_tokenizer("facebook/opt-1.3b")
            generation_model = load_model("facebook/opt-1.3b")
            pipe = pipeline("text-generation", model=generation_model, tokenizer=tokenizer)
            pipelines[model] = pipe
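The `Prompting` branch builds a `pipelines` dict but does not yet call the pipelines. A minimal sketch of how one of them could be invoked, assuming the CodeParrot checkpoint; the prompt and sampling parameters below are illustrative assumptions, not values from this commit:

```python
# Hypothetical usage of one of the pipelines built above; prompt and
# generation settings are illustrative assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("lvwerra/codeparrot")
model = AutoModelForCausalLM.from_pretrained("lvwerra/codeparrot")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "def fibonacci(n):"
outputs = pipe(prompt, max_new_tokens=64, do_sample=True, temperature=0.2)
print(outputs[0]["generated_text"])
```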
datasets/codeparrot.txt
ADDED
@@ -0,0 +1,9 @@
[CodeParrot](https://huggingface.co/lvwerra/codeparrot) was trained on **50GB** of Python data from GitHub repositories: the [CodeParrot dataset](https://huggingface.co/datasets/lvwerra/codeparrot-clean). The original dataset contains a lot of duplicated and noisy data, so it was cleaned with the following steps (a rough sketch of these heuristics follows below):
- Exact match deduplication
- Filtering:
    - Average line length < 100
    - Maximum line length < 1000
    - Alphanumeric character fraction > 0.25
    - Removal of auto-generated files (keyword search)

For more details see the preprocessing script in the transformers repository [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/codeparrot).
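A minimal sketch of what the cleaning heuristics above could look like in code. The thresholds mirror the list, but the function names and the keyword list for auto-generated files are illustrative assumptions, not the actual preprocessing script:

```python
import hashlib

# Placeholder keyword list; the real preprocessing script defines its own.
AUTO_GENERATED_KEYWORDS = ["auto-generated", "autogenerated", "automatically generated"]

def keep_file(content: str) -> bool:
    """Apply the line-length, alphanumeric-fraction and keyword filters."""
    lines = content.splitlines()
    if not lines:
        return False
    lengths = [len(line) for line in lines]
    if sum(lengths) / len(lengths) >= 100:   # average line length < 100
        return False
    if max(lengths) >= 1000:                 # maximum line length < 1000
        return False
    alnum_fraction = sum(ch.isalnum() for ch in content) / len(content)
    if alnum_fraction <= 0.25:               # alphanumeric fraction > 0.25
        return False
    lowered = content.lower()
    return not any(kw in lowered for kw in AUTO_GENERATED_KEYWORDS)

def exact_dedup(files):
    """Exact match deduplication by hashing full file contents."""
    seen, kept = set(), []
    for content in files:
        digest = hashlib.sha256(content.encode("utf-8")).hexdigest()
        if digest not in seen:
            seen.add(digest)
            kept.append(content)
    return kept
```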
datasets/incoder.txt
ADDED
@@ -0,0 +1,16 @@
[InCoder](https://huggingface.co/facebook/incoder-6B) was trained on 216GB of data from GitHub and Stack Overflow spanning 28 programming languages: 52GB are Python, 107GB are other programming languages, and 57GB are Stack Overflow content that isn't code.

The GitHub data was filtered as follows:
- Average line length < 100
- Maximum line length < 3000
- Alphanumeric character fraction > 0.4
- Removal of auto-generated files (keyword search)

The second component of the data consists of questions, answers, and comments from Stack Overflow (the selection rules are sketched below). It includes:
- all questions that have at least one answer
- up to ten answers with a non-negative score (sorted by score) per question
- up to five comments per question/answer

Exact match deduplication was performed on code files.

For more details please refer to this [paper](https://arxiv.org/pdf/2204.05999.pdf).
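A minimal sketch of the Stack Overflow selection rules listed above; the `Post` and `Question` structures are assumptions made for the example, not the actual InCoder preprocessing code:

```python
from dataclasses import dataclass, field

@dataclass
class Post:
    body: str
    score: int
    comments: list = field(default_factory=list)

@dataclass
class Question:
    post: Post
    answers: list = field(default_factory=list)

def select_question(question: Question):
    """Return the kept posts for one question, or None if the question is dropped."""
    if not question.answers:                             # keep questions with >= 1 answer
        return None
    answers = sorted(
        (a for a in question.answers if a.score >= 0),   # non-negative score only
        key=lambda a: a.score,
        reverse=True,
    )[:10]                                               # up to ten answers, sorted by score
    kept = [question.post] + answers
    for post in kept:
        post.comments = post.comments[:5]                # up to five comments per question/answer
    return kept
```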
datasets/opt.txt
ADDED
@@ -0,0 +1,2 @@
[OPT](https://huggingface.co/facebook/opt-30b) was trained on five filtered datasets of textual documents. One of them, [The Pile](https://arxiv.org/pdf/2101.00027v1.pdf), includes code; the subsets used were *Pile-CC, OpenWebText2, USPTO, Project Gutenberg, OpenSubtitles, Wikipedia, DM Mathematics and HackerNews*.
The final training data contains 180B tokens, corresponding to 800GB of data. For more details please refer to this [paper](https://arxiv.org/abs/2205.01068).