cahya committed on
Commit
e770a74
1 Parent(s): bc7bea3

Add multilanguage support ;-)

Browse files

Add prompts for English and German

Files changed (3) hide show
  1. app.py +25 -7
  2. lid.176.ftz +0 -0
  3. prompts.py +8 -0
app.py CHANGED
@@ -4,9 +4,9 @@ from mtranslate import translate
4
  from prompts import PROMPT_LIST
5
  import streamlit as st
6
  import random
 
7
 
8
- token = st.secrets["flax_community_token"]
9
- headers = {"Authorization": f"Bearer {token}"}
10
  LOGO = "huggingwayang.png"
11
  MODELS = {
12
  "GPT-2 Small": {
@@ -48,9 +48,15 @@ def process(text: str,
48
  return query(payload, model_name)
49
 
50
  st.set_page_config(page_title="Indonesian GPT-2 Demo")
51
-
52
  st.title("Indonesian GPT-2")
53
 
 
 
 
 
 
 
 
54
  # Sidebar
55
  st.sidebar.image(LOGO)
56
  st.sidebar.subheader("Configurable parameters")
@@ -81,7 +87,7 @@ top_p = st.sidebar.number_input(
81
  help=" If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation."
82
  )
83
 
84
- do_sample = st.sidebar.selectbox('Sampling?', (True, False), help="Whether or not to use sampling; use greedy decoding otherwise.")
85
 
86
  st.markdown(
87
  """
@@ -90,6 +96,7 @@ st.markdown(
90
  trained on the Indonesian [Oscar](https://huggingface.co/datasets/oscar), [MC4](https://huggingface.co/datasets/mc4)
91
  and [Wikipedia](https://huggingface.co/datasets/wikipedia) dataset. We created it as part of the
92
  [Huggingface JAX/Flax event](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/).
 
93
  """
94
  )
95
 
@@ -99,7 +106,7 @@ ALL_PROMPTS = list(PROMPT_LIST.keys())+["Custom"]
99
  prompt = st.selectbox('Please choose a predefined prompt or create your custom text.', ALL_PROMPTS, index=len(ALL_PROMPTS)-1)
100
 
101
  if prompt == "Custom":
102
- prompt_box = "Enter your text here"
103
  else:
104
  prompt_box = random.choice(PROMPT_LIST[prompt])
105
 
@@ -107,6 +114,14 @@ text = st.text_area("Enter text", prompt_box)
107
 
108
  if st.button("Run"):
109
  with st.spinner(text="Getting results..."):
 
 
 
 
 
 
 
 
110
  st.subheader("Result")
111
  # print(f"maxlen:{max_len}, temp:{temp}, top_k:{top_k}, top_p:{top_p}")
112
  result = process(text=text,
@@ -129,5 +144,8 @@ if st.button("Run"):
129
  else:
130
  result = result[0]["generated_text"]
131
  st.write(result.replace("\n", " \n"))
132
- st.text("English translation")
133
- st.write(translate(result, "en", "id").replace("\n", " \n"))
 
 
 
 
4
  from prompts import PROMPT_LIST
5
  import streamlit as st
6
  import random
7
+ import fasttext
8
 
9
+ headers = {}
 
10
  LOGO = "huggingwayang.png"
11
  MODELS = {
12
  "GPT-2 Small": {
 
48
  return query(payload, model_name)
49
 
50
  st.set_page_config(page_title="Indonesian GPT-2 Demo")
 
51
  st.title("Indonesian GPT-2")
52
 
53
+ try:
54
+ token = st.secrets["flax_community_token"]
55
+ headers = {"Authorization": f"Bearer {token}"}
56
+ except FileNotFoundError:
57
+ print(f"Token is not found")
58
+
59
+ ft_model = fasttext.load_model('lid.176.ftz')
60
  # Sidebar
61
  st.sidebar.image(LOGO)
62
  st.sidebar.subheader("Configurable parameters")
 
87
  help=" If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation."
88
  )
89
 
90
+ # do_sample = st.sidebar.selectbox('Sampling?', (True, False), help="Whether or not to use sampling; use greedy decoding otherwise.")
91
 
92
  st.markdown(
93
  """
 
96
  trained on the Indonesian [Oscar](https://huggingface.co/datasets/oscar), [MC4](https://huggingface.co/datasets/mc4)
97
  and [Wikipedia](https://huggingface.co/datasets/wikipedia) dataset. We created it as part of the
98
  [Huggingface JAX/Flax event](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/).
99
+ It is also possible to write a custom prompt on your own language.
100
  """
101
  )
102
 
 
106
  prompt = st.selectbox('Please choose a predefined prompt or create your custom text.', ALL_PROMPTS, index=len(ALL_PROMPTS)-1)
107
 
108
  if prompt == "Custom":
109
+ prompt_box = "Feel free to write text in any language"
110
  else:
111
  prompt_box = random.choice(PROMPT_LIST[prompt])
112
 
 
114
 
115
  if st.button("Run"):
116
  with st.spinner(text="Getting results..."):
117
+ lang_predictions, lang_probability = ft_model.predict(text, k=3)
118
+ # print(f"lang: {lang_predictions}, {lang_probability}")
119
+ if "__label__id" in lang_predictions:
120
+ lang = "id"
121
+ else:
122
+ lang = lang_predictions[0].replace("__label__", "")
123
+ text = translate(text, "id", lang)
124
+ # print(f"{lang}: {text}")
125
  st.subheader("Result")
126
  # print(f"maxlen:{max_len}, temp:{temp}, top_k:{top_k}, top_p:{top_p}")
127
  result = process(text=text,
 
144
  else:
145
  result = result[0]["generated_text"]
146
  st.write(result.replace("\n", " \n"))
147
+ st.text("Translation")
148
+ if lang == "id":
149
+ st.write(translate(result, "en", "id").replace("\n", " \n"))
150
+ else:
151
+ st.write(translate(result, lang, "id").replace("\n", " \n"))
lid.176.ftz ADDED
Binary file (938 kB). View file
 
prompts.py CHANGED
@@ -17,5 +17,13 @@ PROMPT_LIST = {
17
  "Mohammad Natsir adalah seorang ulama, politisi, dan pejuang kemerdekaan Indonesia.",
18
  "Ir. H. Soekarno adalah Presiden pertama Republik Indonesia. Ia adalah seorang tokoh perjuangan yang memainkan peranan penting dalam memerdekakan bangsa Indonesia",
19
  "Borobudur adalah sebuah candi Buddha yang terletak di sebelah barat laut Yogyakarta. Monumen ini merupakan model alam semesta dan dibangun sebagai tempat suci untuk memuliakan Buddha"
 
 
 
 
 
 
 
 
20
  ]
21
  }
 
17
  "Mohammad Natsir adalah seorang ulama, politisi, dan pejuang kemerdekaan Indonesia.",
18
  "Ir. H. Soekarno adalah Presiden pertama Republik Indonesia. Ia adalah seorang tokoh perjuangan yang memainkan peranan penting dalam memerdekakan bangsa Indonesia",
19
  "Borobudur adalah sebuah candi Buddha yang terletak di sebelah barat laut Yogyakarta. Monumen ini merupakan model alam semesta dan dibangun sebagai tempat suci untuk memuliakan Buddha"
20
+ ],
21
+ "English": [
22
+ "Deoxyribonucleic acid is a molecule composed of two polynucleotide chains that coil around each other",
23
+ "Javanese is the largest of the Austronesian languages in number of native speakers"
24
+ ],
25
+ "German": [
26
+ "Eine Meerjungfrau, auch Seejungfrau oder Fischweib, ist ein weibliches Fabelwesen, ein Mischwesen aus Frauen- und Fischkörper",
27
+ "Der Mond ist der einzige natürliche Satellit der Erde"
28
  ]
29
  }