ola13 committed on
Commit
2ce847c
•
1 Parent(s): aa7dd1c

ROOTS search tool

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. Makefile +7 -0
  3. README.md +5 -4
  4. app.py +236 -0
  5. roots_search_tool_specs.pdf +3 -0
.gitattributes CHANGED
@@ -31,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
31
  *.zip filter=lfs diff=lfs merge=lfs -text
32
  *.zst filter=lfs diff=lfs merge=lfs -text
33
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
31
  *.zip filter=lfs diff=lfs merge=lfs -text
32
  *.zst filter=lfs diff=lfs merge=lfs -text
33
  *tfevents* filter=lfs diff=lfs merge=lfs -text
34
+ *.pdf filter=lfs diff=lfs merge=lfs -text
Makefile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ .PHONY: style
2
+
3
+ # Format source code automatically
4
+
5
+ style:
6
+ black --line-length 119 --target-version py36 .
7
+ isort .
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
- title: Roots Search
3
- emoji: 📊
4
- colorFrom: pink
5
- colorTo: purple
6
  sdk: gradio
7
  sdk_version: 3.7
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Roots Search Tool
3
+ emoji: 🌸 🔎
4
+ colorFrom: green
5
+ colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 3.7
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.client as http_client
2
+ import json
3
+ import logging
4
+ import os
5
+ import re
6
+ import string
7
+
8
+ import gradio as gr
9
+ import requests
10
+
11
+
12
def get_docid_html(docid):
    """Render a document id as an HTML snippet linking to its dataset page.

    Args:
        docid: Identifier of the form ``"<data_org>/<dataset>/<document>"``.

    Returns:
        An HTML string: an anchor labelled ``<data_org>/<dataset>`` that points
        at ``https://huggingface.co/datasets/bigscience-data/<dataset>``,
        followed by a span containing ``/<document>``.

    Raises:
        ValueError: If ``docid`` has fewer than three ``/``-separated parts.
    """
    import html

    # maxsplit=2 keeps any additional slashes inside the document part instead
    # of breaking the three-way unpacking.
    data_org, dataset, document = docid.split("/", 2)

    # The parts are interpolated into markup, so escape them first.
    return """<a
        class="underline-on-hover"
        title="I am hovering over the text"
        style="color:#2D31FA;"
        href="https://huggingface.co/datasets/bigscience-data/{}"
        target="_blank">{}</a><span style="color: #7978FF;">/{}</span>""".format(
        html.escape(dataset), html.escape(data_org + "/" + dataset), html.escape(document)
    )
24
+
25
+
26
# Tag names that may follow PII_PREFIX in a document's text.
PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
PII_PREFIX = "PI:"

# One precompiled alternation over all placeholders. Longer tags come first so
# that a tag which is a prefix of another can never shadow it, and the result
# does not depend on set iteration order.
_PII_PATTERN = re.compile(
    re.escape(PII_PREFIX)
    + "("
    + "|".join(re.escape(tag) for tag in sorted(PII_TAGS, key=lambda tag: (-len(tag), tag)))
    + ")"
)
_PII_REPLACEMENT = """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>"""


def process_pii(text):
    """Replace every ``PI:<TAG>`` placeholder in ``text`` with a highlighted marker.

    Args:
        text: String that may contain placeholders made of ``PII_PREFIX``
            followed by one of ``PII_TAGS``.

    Returns:
        ``text`` with each placeholder replaced by a bold ``<mark>`` element
        reading ``REDACTED <TAG>``. Text without placeholders is returned
        unchanged.
    """
    # A single pass over the text instead of one full scan per tag.
    return _PII_PATTERN.sub(lambda match: _PII_REPLACEMENT.format(match.group(1)), text)
37
+
38
+
39
def process_results(results, highlight_terms):
    """Render a list of search hits as an HTML fragment.

    Args:
        results: Sequence of result dicts. Each is read for ``"text"``,
            ``"docid"`` and ``"lang"``, and optionally ``"meta"`` (a dict that
            may hold ``"url"``).
        highlight_terms: Terms to render in bold when a whitespace-separated
            token of the text equals one of them. Assumed to be an iterable of
            strings -- TODO confirm against the backend response.

    Returns:
        An HTML string ending in ``<hr>``: a "No results retrieved." notice for
        empty input, otherwise one section per result with its source link (if
        any), document id, language and text.
    """
    import html

    if not results:
        return """<br><p style='font-family: Arial; color:Silver; text-align: center;'>
                No results retrieved.</p><br><hr>"""

    # Built once so the per-token membership test is O(1).
    highlighted = set(highlight_terms)

    sections = []
    for result in results:
        # The text is retrieved corpus content, not trusted markup, so every
        # token is escaped before being embedded in the page.
        tokens_html = " ".join(
            "<b>{}</b>".format(html.escape(token)) if token in highlighted else html.escape(token)
            for token in result["text"].split()
        )
        # PII placeholders contain no HTML-special characters, so they survive
        # escaping and can still be replaced afterwards.
        tokens_html = process_pii(tokens_html)

        meta = result.get("meta")
        if meta is not None and "url" in meta:
            url = html.escape(str(meta["url"]))
            meta_html = """
                <p class='underline-on-hover' style='font-size:12px; font-family: Arial; color:#585858; text-align: left;'>
                <a href='{}' target='_blank'>{}</a></p>""".format(
                url, url
            )
        else:
            meta_html = ""

        docid_html = get_docid_html(result["docid"])
        sections.append(
            """{}
                <p style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {}</p>
                <p style='font-size:12px; font-family: Arial; color:MediumAquaMarine'>Language: {}</p>
                <p style='font-family: Arial;'>{}</p>
                <br>
                """.format(
                meta_html, docid_html, html.escape(str(result["lang"])), tokens_html
            )
        )
    return "".join(sections) + "<hr>"
74
+
75
+
76
def scisearch(query, language, num_results=10):
    """Query the search backend and return the results rendered as HTML.

    The backend URL is read from the ``address`` environment variable. The
    request body carries ``query`` and ``k``; ``lang`` is added unless
    ``language`` is ``"detect_language"``.

    Args:
        query: Search query text.
        language: A language code, ``"detect_language"``, or ``"all"``. With
            ``"all"`` the ``"results"`` entry of the response is iterated as a
            mapping from language to a list of results; otherwise it is treated
            as a list.
        num_results: Maximum number of results to request.

    Returns:
        An HTML string: the rendered results, a notice when the backend reports
        an ``unsupported_lang`` error, or a generic error message naming the
        exception type if anything fails.
    """
    import html

    try:
        post_data = {"query": query, "k": num_results}
        if language != "detect_language":
            post_data["lang"] = language

        output = requests.post(
            os.environ.get("address"),
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=60,
        )

        payload = json.loads(output.text)

        if "err" in payload:
            if payload["err"]["type"] == "unsupported_lang":
                detected_lang = html.escape(str(payload["err"]["meta"]["detected_lang"]))
                return f"""
                <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
                Detected language <b>{detected_lang}</b> is not supported.<br>
                Please choose a language from the dropdown or type another query.
                </p><br><hr><br>"""
            # Other error types fall through; a payload without "results" then
            # raises KeyError, which the handler below reports.

        results = payload["results"]
        highlight_terms = payload["highlight_terms"]

        if language == "detect_language":
            banner = ""
            if results:
                # The detected language is taken from the first hit.
                first_lang = html.escape(str(results[0]["lang"]))
                banner = f"""<p style='font-family: Arial; color:MediumAquaMarine; text-align: center; line-height: 3em'>
                Detected language: <b>{first_lang}</b></p><br><hr><br>"""
            return banner + process_results(results, highlight_terms)

        if language == "all":
            sections = []
            for lang, results_for_lang in results.items():
                lang_label = html.escape(str(lang))
                if not results_for_lang:
                    sections.append(
                        f"""<p style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
                        No results for language: <b>{lang_label}</b><hr></p>"""
                    )
                    continue

                rendered = process_results(results_for_lang, highlight_terms)
                sections.append(
                    f"""
                    <details>
                    <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
                    Results for language: <b>{lang_label}</b><hr>
                    </summary>
                    {rendered}
                    </details>"""
                )
            return "".join(sections)

        return process_results(results, highlight_terms)

    except Exception as e:
        # Top-level boundary for the UI callback: keep the page responsive, but
        # record the traceback. The query text is deliberately not logged.
        logging.exception("ROOTS search request failed")
        return f"""
                <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
                Raised {type(e).__name__}</p>
                <p style='font-size:14px; font-family: Arial; '>
                Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
                </p>
                """
144
+
145
+
146
def flag(query, language, num_results, issue_description):
    """Report a search to the backend as flagged, with a user description.

    Sends the same fields as a search request plus ``"flag": True`` and
    ``"description"`` to the URL in the ``address`` environment variable.
    This is best-effort: failures are logged and never raised.

    Args:
        query: The query that was searched.
        language: Selected language; sent as ``lang`` unless it is
            ``"detect_language"``.
        num_results: Number of results that were requested.
        issue_description: Free-text context supplied by the user.

    Returns:
        Always the empty string.
    """
    try:
        post_data = {"query": query, "k": num_results, "flag": True, "description": issue_description}
        if language != "detect_language":
            post_data["lang"] = language

        output = requests.post(
            os.environ.get("address"),
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=120,
        )

        # Parsed only to check that the backend answered with valid JSON; the
        # content itself is not used.
        json.loads(output.text)
    except Exception:
        # Narrower than a bare ``except`` so KeyboardInterrupt and SystemExit
        # still propagate; the traceback goes to the log.
        logging.exception("Error flagging")
    return ""
163
+
164
+
165
# Markdown shown at the top of the page (passed to gr.Markdown in the UI).
description = """# <p style="text-align: center;"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). This tool allows
you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages included in
ROOTS. You can read more about the details of the tool design
[here](https://huggingface.co/spaces/bigscience-data/scisearch/blob/main/roots_search_tool_specs.pdf). For more
information and instructions on how to access the full corpus check [this form](https://forms.gle/qyYswbEL5kA23Wu99)."""
172
+
173
+
174
if __name__ == "__main__":
    # Build and launch the Gradio UI: a query box, language dropdown and result
    # count slider feeding `scisearch`, plus a flagging form that stays hidden
    # until a search has been submitted.
    demo = gr.Blocks(
        css=".underline-on-hover:hover { text-decoration: underline; } .flagging { font-size:12px; color:Silver; }"
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(lines=2, placeholder="Type your query here...", label="Query")
        with gr.Row():
            lang = gr.Dropdown(
                choices=[
                    "ar",
                    "ca",
                    "code",
                    "en",
                    "es",
                    "eu",
                    "fr",
                    "id",
                    "indic",
                    "nigercongo",
                    "pt",
                    "vi",
                    "zh",
                    "detect_language",
                    "all",
                ],
                value="pt",
                label="Language",
            )
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results")
        with gr.Column(visible=False) as flagging_form:
            flag_txt = gr.Textbox(
                lines=1,
                placeholder="Type here...",
                label="""If you choose to flag your search, we will save the query, language and the number of results
                you requested. Please consider adding relevant additional context below:""",
            )
            flag_btn = gr.Button("Flag Results")
            # `flag` returns "", which is written back to the description box.
            flag_btn.click(flag, inputs=[query, lang, k, flag_txt], outputs=[flag_txt])

        def submit(query, lang, k):
            """Run a search and return updates for the results pane and flagging form."""
            if query == "":
                # Same dict shape as the normal path: clear the results and send
                # a no-op update to the form rather than an empty string.
                return {results: "", flagging_form: gr.update()}
            return {
                results: scisearch(query, lang, k),
                flagging_form: gr.update(visible=True),
            }

        submit_btn.click(submit, inputs=[query, lang, k], outputs=[results, flagging_form])

    demo.launch(enable_queue=True, debug=True)
roots_search_tool_specs.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44cf8caa3819e8abf036178c4d329363d0649def28ec4e0979a0e0b2b94362d9
3
+ size 2642911