anton-l HF staff committed on
Commit
b34130a
·
verified ·
1 Parent(s): 1bd4ed3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +171 -0
app.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import html
4
+
5
+ import gradio as gr
6
+ from opensearchpy import OpenSearch, RequestsHttpConnection
7
+ from requests_aws4auth import AWS4Auth
8
+
9
+
10
# Request signer for the search cluster. The key pair is read from the
# environment; an unset variable is passed through as None.
awsauth = AWS4Auth(
    os.getenv("ACCESS_KEY"),
    os.getenv("SECRET_KEY"),
    "us-east-1",
    "es",
)

# Shared OpenSearch client used by every query helper in this module.
# It talks HTTPS on port 443 with certificate verification and compressed
# request bodies.
es = OpenSearch(
    hosts=[{"host": os.getenv("HOST"), "port": 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    http_compress=True,
    timeout=200,
)
26
+
27
+
28
def mark_tokens_bold(text, tokens):
    """Wrap every literal occurrence of the given tokens in ``<b>`` tags.

    All tokens are matched in a single left-to-right pass, so markup that
    was just inserted is never re-scanned (no nested or broken tags), and
    the matched text is emitted verbatim (backslashes in a token are safe).

    Args:
        text: The string to annotate.
        tokens: Iterable of literal strings to highlight. Empty strings are
            ignored, as are the single-character tokens ``<``, ``b``, ``/``
            and ``>``, which have always been excluded from highlighting.

    Returns:
        ``text`` with each match wrapped as ``<b>match</b>``. When several
        tokens can match at the same position, the longest one wins.
    """
    skipped = {"<", "b", "/", ">"}
    # dict.fromkeys de-duplicates while keeping first-seen order; an empty
    # token must be dropped because an empty pattern matches everywhere.
    candidates = [tok for tok in dict.fromkeys(tokens) if tok and tok not in skipped]
    if not candidates:
        return text

    # Longest first so that "foobar" is preferred over its prefix "foo".
    candidates.sort(key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(tok) for tok in candidates))

    # A function replacement avoids backslash/group-reference processing
    # that a replacement *string* would be subject to.
    return pattern.sub(lambda match: "<b>" + match.group(0) + "</b>", text)
35
+
36
+
37
def process_results(results, query):
    """Render search hits as one HTML fragment.

    Args:
        results: Sequence of dicts with the keys ``text``, ``repository``,
            ``commit_id``, ``path``, ``license`` and ``language``.
            ``text`` is inserted as markup and is expected to be
            HTML-escaped already (``search`` does this); every other field
            is escaped here before being placed into the page.
        query: The user's query. If it is wrapped in double quotes, the
            quotes are dropped before the query is split into the tokens
            to highlight.

    Returns:
        An HTML string: one header line (source link, language, licenses),
        one scrollable ``<pre>`` block and one ``<hr>`` per hit, or a
        "No results retrieved." message when ``results`` is empty.
    """
    if not results:
        return """<br><p>No results retrieved.</p><br><hr>"""

    # The highlight tokens depend only on the query, so compute them once.
    is_quoted = query.startswith('"') and query.endswith('"')
    phrase = query[1:-1] if is_quoted else query
    # Splitting on single spaces yields "" for repeated spaces; drop those.
    tokens = [tok for tok in phrase.split(" ") if tok]

    fragments = []
    for result in results:
        text_html = mark_tokens_bold(result["text"], tokens)

        # Metadata comes straight from the index; escape it so it cannot
        # break out of the attribute or inject markup.
        repository = html.escape(str(result["repository"]))
        commit_id = html.escape(str(result["commit_id"]))
        path = html.escape(str(result["path"]))
        license_text = html.escape(str(result["license"]))
        language = html.escape(str(result["language"]))

        # 20px per line, limited to a maximum height of 600px.
        code_height = min(600, (text_html.count("\n") + 1) * 20)

        link = f"{repository}/blob/{commit_id}{path}"
        fragments.append(
            "<p style='font-size:16px; text-align: left;'><b>Source: </b>"
            f'<a target="_blank" href="https://github.com/{link}">{link}</a>'
            "&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;<b>Language:</b> "
            f"<span style='color: #00134d;'>{language}</span>"
            "&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;<b>Licenses: </b>"
            f"<span style='color: #00134d;'>{license_text}</span></p>\n"
            f"<pre style='height: {code_height}px; overflow-y: scroll; "
            "overflow-x: hidden; color: #d9d9d9;border: 1px solid #e6b800; "
            f"padding: 10px'><code>{text_html}</code></pre>\n"
            "<hr>\n"
        )
    return "".join(fragments)
72
+
73
+
74
def match_query(query, num_results=10):
    """Search the ``content`` field with a ``match`` query.

    The index name is read from the ``INDEX`` environment variable.
    Returns the ``_source`` document of each hit, requesting at most
    ``num_results`` hits.
    """
    index_name = os.environ.get("INDEX")
    request = {
        "query": {"match": {"content": query}},
        "size": num_results,
    }
    reply = es.search(index=index_name, body=request)
    return [entry["_source"] for entry in reply["hits"]["hits"]]
80
+
81
+
82
def phrase_query(query, num_results=10):
    """Search the ``content`` field with a ``match_phrase`` query.

    The index name is read from the ``INDEX`` environment variable.
    Returns the ``_source`` document of each hit, requesting at most
    ``num_results`` hits.
    """
    index_name = os.environ.get("INDEX")
    request = {
        "query": {"match_phrase": {"content": query}},
        "size": num_results,
    }
    reply = es.search(index=index_name, body=request)
    return [entry["_source"] for entry in reply["hits"]["hits"]]
88
+
89
+
90
def search(query, num_results=10):
    """Run a query against the index and return the rendered HTML results.

    The query is cut to its first 200 characters. If the (truncated) query
    starts and ends with a double quote, the text between the quotes is
    sent as a phrase query; otherwise the query is sent as a match query.
    Hit contents are HTML-escaped before rendering.
    """
    # Log whether the cluster answers a ping.
    print(es.ping())

    query = query[:200]
    wants_phrase = query.startswith('"') and query.endswith('"')
    if wants_phrase:
        hits = phrase_query(query[1:-1], num_results=num_results)
    else:
        hits = match_query(query, num_results=num_results)

    results = []
    for hit in hits:
        results.append(
            {
                "text": html.escape(hit["content"]),
                "repository": hit["repository"],
                "commit_id": hit["commit_id"],
                "path": hit["path"],
                # Use the GitHub license when present, else the scancode one.
                "license": hit["gh_license"] if hit["gh_license"] is not None else hit["scancode_license"],
                "language": hit["language"],
            }
        )
    return process_results(results, query)
109
+
110
+
111
# Markdown/HTML banner shown at the top of the page (rendered via gr.Markdown).
description = """# <p style="text-align: center;"><span style='color: #e6b800;'>StarCoder:</span> Dataset Search 🔍 </p>
<span>When using <a href="https://huggingface.co/bigcode/starcoder2-15b" style="color: #e6b800;">StarCoder</a> to generate code, it might produce close or exact copies of code in the pretraining dataset. Identifying such cases can provide important context, and help credit the original developer of the code. With this search tool, our aim is to help in identifying if the code belongs to an existing repository. For exact matches, enclose your query in double quotes. <br><br><i>This first iteration of the search tool truncates queries down to 200 characters, so as not to overwhelm the server it is currently running on. Please note that this is not a production-ready app, but rather a research tool that we make available as a proof-of-concept. If you need a reliable search app for your business or research, we would advise you to index the dataset yourself.</i></span>"""
113
+
114
# Gradio theme for the app: Monochrome base with custom hues, small corner
# radius and an "Open Sans" font stack.
theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    radius_size=gr.themes.sizes.radius_sm,
    font=[
        gr.themes.GoogleFont("Open Sans"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
)

# NOTE(review): this value is replaced by the assignment to `css` further
# down before anything reads it, so the `.generating` rule never reaches
# the page. Confirm whether it was meant to be kept.
css = ".generating {visibility: hidden}"

# Monospace font for the query box (the Textbox with elem_id "q-input").
monospace_css = """
#q-input textarea {
    font-family: monospace, 'Consolas', Courier, monospace;
}
"""

# Final stylesheet handed to gr.Blocks.
css = f"{monospace_css}.gradio-container {{color: black}}"
135
+
136
+
137
if __name__ == "__main__":
    # Build the single-page UI: banner, query box, result-count slider,
    # submit button and an HTML pane for the rendered hits.
    demo = gr.Blocks(
        theme=theme,
        css=css,
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(
                lines=5,
                placeholder="Type your query here...",
                label="Query",
                elem_id="q-input",
            )
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results", value="")

        def submit(query, k, lang="en"):
            """Handle a search request from the UI.

            Args:
                query: Text from the query box (may be ``None`` or blank).
                k: Maximum number of results, taken from the slider.
                lang: Unused; kept so the signature stays compatible.

            Returns:
                A dict mapping the ``results`` component to its new HTML.
                A blank query clears the pane instead of hitting the index.
            """
            # Guard against None *before* stripping; the component may
            # deliver no value at all.
            query = (query or "").strip()
            if not query:
                # Exactly one output component is wired up, so return
                # exactly one value for it.
                return {results: ""}
            # The slider value is coerced to int because it is used as the
            # result count ("size") of the search request.
            return {results: search(query, int(k))}

        # Pressing Enter in the textbox and clicking the button do the same.
        query.submit(fn=submit, inputs=[query, k], outputs=[results])
        submit_btn.click(submit, inputs=[query, k], outputs=[results])

    demo.launch(debug=True)