speech-test committed on
Commit
3932541
•
1 Parent(s): 529022b
Files changed (3) hide show
  1. README.md +6 -6
  2. app.py +132 -0
  3. requirements.txt +1 -0
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: SpeechLeaderboard
3
- emoji: 💩
4
- colorFrom: gray
5
- colorTo: gray
6
  sdk: streamlit
7
- sdk_version: 1.2.0
8
  app_file: app.py
9
- pinned: false
10
  license: apache-2.0
11
  ---
12
 
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
 
1
  ---
2
+ title: Speech Recognition Leaderboard
3
+ emoji: 📈
4
+ colorFrom: red
5
+ colorTo: yellow
6
  sdk: streamlit
 
7
  app_file: app.py
8
+ pinned: true
9
  license: apache-2.0
10
  ---
11
 
12
+
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py CHANGED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import pandas as pd
3
+ from tqdm.auto import tqdm
4
+ import streamlit as st
5
+ from huggingface_hub import HfApi, hf_hub_download
6
+ from huggingface_hub.repocard import metadata_load
7
+
8
+ cer_langs = ["ja", "zh-CN", "zh-HK", "zh-TW"]  # languages whose table shows and sorts by CER instead of WER
9
+
10
+
11
def make_clickable(model_name):
    """Return an HTML anchor linking to the model's page on huggingface.co.

    Args:
        model_name: Hub repository id, e.g. ``"user/model"``.

    Returns:
        An ``<a target="_blank">`` element (as a string) whose text is the
        model id and whose ``href`` is ``https://huggingface.co/<model id>``.
    """
    # Local import: ``html`` is not part of the module's import block.
    import html

    # The leaderboard table is written out with HTML escaping disabled, so the
    # id must be escaped here or markup-significant characters in a repository
    # id would be injected into the page verbatim.
    safe_name = html.escape(model_name, quote=True)
    link = "https://huggingface.co/" + safe_name
    return f'<a target="_blank" href="{link}">{safe_name}</a>'
14
+
15
+
16
def get_model_ids():
    """Return the ``modelId`` of every Hub model matching the ``robust-speech-event`` filter."""
    hub = HfApi()
    return [entry.modelId for entry in hub.list_models(filter="robust-speech-event")]
21
+
22
+
23
def get_metadata(model_id):
    """Download a model's ``README.md`` and return its parsed metadata.

    Args:
        model_id: Hub repository id of the model.

    Returns:
        Whatever ``metadata_load`` yields for the downloaded README, or
        ``None`` when the download fails with an HTTP error.
    """
    try:
        return metadata_load(hf_hub_download(model_id, filename="README.md"))
    except requests.exceptions.HTTPError:
        # Typically a 404: the repository has no README.md.
        return None
30
+
31
+
32
def parse_metric_value(value):
    """Normalize a metric value from a model card to a number rounded to 2 places.

    Model cards are hand-written, so the value may arrive in several shapes:

    * ``str`` - parsed as a float after removing any ``%`` sign
      (``"12.3%"`` -> ``12.3``); unparsable text yields ``None``.
    * ``float`` - values below 1.0 are assumed to be fractions
      (``0.123`` -> ``12.3``) and are scaled to percent.
    * ``int`` - used as is.
    * ``list`` - the first element is parsed with these same rules; an empty
      list yields ``None``.
    * anything else (including ``None``) yields ``None``.

    Args:
        value: Raw ``value`` field of a metric entry.

    Returns:
        The metric rounded to two decimals, or ``None`` if it cannot be parsed.
    """
    if isinstance(value, list):
        # Only the first reported number is considered.
        return parse_metric_value(value[0]) if value else None
    if isinstance(value, str):
        try:
            # str.replace returns a new string; the cleaned text must be the
            # one handed to float(), otherwise "12.3%" can never be parsed.
            value = float(value.replace("%", "").strip())
        except ValueError:
            return None
    elif isinstance(value, float):
        if value < 1.0:
            # assuming that WER is given in 0.xx format
            value = 100 * value
    elif not isinstance(value, int):
        # Unsupported shape (None, dict, ...): treat as "no value" rather than
        # letting round() raise.
        return None
    return round(value, 2)
49
+
50
+
51
def parse_metrics_row(meta):
    """Extract one leaderboard row from a model card's metadata.

    Only the first ``model-index`` entry is inspected. Its results are scanned
    in order and the first one that has a dataset (with both ``type`` and
    ``args``) and at least one parsable WER or CER metric becomes the row.
    Malformed entries are skipped instead of raising, because the metadata is
    free-form YAML written by model authors.

    Args:
        meta: Mapping of model card metadata.

    Returns:
        A dict with ``"dataset"``, ``"lang"`` and ``"wer"`` and/or ``"cer"``
        keys, or ``None`` when no usable result is found.
    """
    if "model-index" not in meta or "language" not in meta:
        return None

    lang = meta["language"]
    if isinstance(lang, list):
        if not lang:
            return None
        # Multilingual cards are filed under their first language.
        lang = lang[0]

    model_index = meta["model-index"]
    if not isinstance(model_index, list) or not model_index:
        return None
    first_entry = model_index[0]
    results = first_entry.get("results") if isinstance(first_entry, dict) else None

    for result in results or []:
        if not isinstance(result, dict):
            continue
        dataset_info = result.get("dataset")
        metrics = result.get("metrics")
        if not isinstance(dataset_info, dict) or not isinstance(metrics, list):
            continue
        # Results whose dataset lacks a type or a config ("args") are ignored.
        if "type" not in dataset_info or "args" not in dataset_info:
            continue

        row = {"dataset": dataset_info["type"], "lang": lang}
        for metric in metrics:
            if not isinstance(metric, dict):
                continue
            metric_type = str(metric.get("type", "")).lower().strip()
            if metric_type not in ("wer", "cer"):
                continue
            value = parse_metric_value(metric.get("value"))
            if value is None:
                continue
            if metric_type not in row or value < row[metric_type]:
                # overwrite the metric if the new value is lower (e.g. with LM)
                row[metric_type] = value
        if "wer" in row or "cer" in row:
            return row
    return None
77
+
78
+
79
@st.cache(ttl=600)
def get_data():
    """Build the leaderboard table, one row per model with usable metrics.

    Every id from ``get_model_ids()`` is looked up with ``get_metadata`` and
    turned into a row by ``parse_metrics_row``; models for which either step
    yields ``None`` are left out. The result is cached by Streamlit with a
    TTL of 600.

    Returns:
        A ``pandas.DataFrame`` built from the collected row dicts, each
        extended with a ``"model_id"`` entry.
    """
    records = []
    for model_id in tqdm(get_model_ids()):
        meta = get_metadata(model_id)
        row = parse_metrics_row(meta) if meta is not None else None
        if row is not None:
            row["model_id"] = model_id
            records.append(row)
    return pd.DataFrame.from_records(records)
93
+
94
+
95
# --- Page script: load the leaderboard data and render the selected table ---

# Work on a copy so the frame held by the get_data() cache is never mutated.
dataframe = get_data().copy()
if dataframe.empty:
    # With no rows there are no columns to build the selectors from.
    st.warning("No models with WER/CER metrics were found.")
    st.stop()

# Blank out gaps in the text columns only. The metric columns must stay
# numeric: filling them with "" mixes str and float, and sorting such a
# column raises TypeError as soon as one row lacks the metric.
text_columns = [col for col in dataframe.columns if col not in ("wer", "cer")]
dataframe[text_columns] = dataframe[text_columns].fillna("")
dataframe["model_id"] = dataframe["model_id"].apply(make_clickable)

_, col_center = st.columns([3, 6])
with col_center:
    st.image("logo.png", width=200)
st.markdown("# Speech Models Leaderboard")

lang = st.selectbox(
    "Language",
    sorted(dataframe["lang"].unique()),
    index=0,
)
lang_df = dataframe[dataframe.lang == lang]

dataset = st.selectbox(
    "Dataset",
    sorted(lang_df["dataset"].unique()),
    index=0,
)

# Languages listed in cer_langs are ranked by CER, all others by WER.
metric = "cer" if lang in cer_langs else "wer"
dataset_df = (
    lang_df[lang_df.dataset == dataset]
    # reindex (rather than [[...]]) tolerates a metric column that no model
    # has reported yet: it is created empty instead of raising KeyError.
    .reindex(columns=["model_id", metric])
    .sort_values(metric, na_position="last")
    # Show missing scores as blank cells, after sorting is done.
    .fillna("")
    .rename(
        columns={
            "model_id": "Model",
            "wer": "WER (lower is better)",
            "cer": "CER (lower is better)",
        }
    )
)

# escape=False keeps the model links clickable in the rendered table.
st.write(dataset_df.to_html(escape=False, index=None), unsafe_allow_html=True)
requirements.txt CHANGED
@@ -0,0 +1 @@
 
 
1
+ pandas