Spaces:
Running
Running
christopher
committed on
Commit
•
8de89ad
0
Parent(s):
Duplicate from bigcode/py-search
Browse files- .gitattributes +34 -0
- README.md +14 -0
- app.py +98 -0
- requirements.txt +2 -0
.gitattributes
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
25 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
26 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: StarCoder Python Search
|
3 |
+
emoji: ๐๐๐
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: gray
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.12.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: apache-2.0
|
11 |
+
duplicated_from: bigcode/py-search
|
12 |
+
---
|
13 |
+
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
import os
|
4 |
+
import re
|
5 |
+
import string
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
from elasticsearch import Elasticsearch
|
9 |
+
from elasticsearch_dsl import Search, Q
|
10 |
+
|
11 |
+
# Shared Elasticsearch client used by the query helpers in this file; the
# server address is read from the `host` environment variable.
es = Elasticsearch(os.environ.get("host"), timeout=10)
|
12 |
+
|
13 |
+
def mark_tokens_bold(string, tokens):
    """Wrap every occurrence of each token in ``string`` in pink, bold HTML.

    All tokens are matched in a single pass, so a later token can never match
    inside the markup inserted for an earlier one (for example the token ``b``
    hitting ``<b>``). When tokens overlap at a position, the longest one wins.

    Args:
        string: Text to annotate. The name shadows the stdlib ``string``
            module inside this function; it is kept so existing keyword
            callers continue to work.
        tokens: Iterable of literal substrings to highlight. Empty tokens
            are ignored.

    Returns:
        ``string`` with each match wrapped in
        ``<span style='color: #ff75b3;'><b>...</b></span>``.
    """
    # De-duplicate, drop empty tokens (they would match at every position),
    # and try longer tokens first so "foobar" beats "foo" at the same offset.
    unique_tokens = sorted(
        dict.fromkeys(token for token in tokens if token),
        key=len,
        reverse=True,
    )
    if not unique_tokens:
        return string

    pattern = re.compile("|".join(re.escape(token) for token in unique_tokens))

    # A callable replacement inserts the matched text verbatim; a replacement
    # *string* would have its backslashes interpreted as escape sequences.
    return pattern.sub(
        lambda match: "<span style='color: #ff75b3;'><b>" + match.group(0) + "</b></span>",
        string,
    )
|
18 |
+
|
19 |
+
|
20 |
+
def process_results(results):
    """Render search hits as an HTML fragment for the results pane.

    Args:
        results: Sequence of dicts, each with a ``"text"`` entry (the matched
            file contents) and a ``"repository"`` entry (the repository label).

    Returns:
        An HTML string containing one repository header and one scrollable
        code box per hit, or a "No results retrieved." notice when
        ``results`` is empty.
    """
    # Function-scope import so this block does not depend on the file header.
    import html

    if not results:
        return """<br><p>No results retrieved.</p><br><hr>"""

    template = """\
<p style='font-size:16px; text-align: left; color: white;'>Repository: <span style='color: #727cd6;'>{}</span></p>
<br>
<pre style='height: 600px; overflow-y: scroll; overflow-x: hidden; color: #d9d9d9;border: 1px solid #ff75b3; padding: 10px'><code>{}</code></pre>
<br>
<hr>
<br>
"""

    # Hits are raw source files: escape them so characters such as "<" and "&"
    # are shown literally instead of being parsed as markup by the browser.
    return "".join(
        template.format(
            html.escape(str(result["repository"])),
            html.escape(str(result["text"])),
        )
        for result in results
    )
|
39 |
+
|
40 |
+
|
41 |
+
def match_query(query, num_results=10):
    """Run a ``match`` query for ``query`` on the ``source__content`` field.

    The index name is read from the ``index`` environment variable and the
    request is sliced to at most ``num_results`` hits before execution.

    Returns:
        The response object produced by executing the search.
    """
    request = Search(using=es, index=os.environ.get("index"))
    request.query = Q("match", source__content=query)
    return request[:num_results].execute()
|
47 |
+
|
48 |
+
def phrase_query(query, num_results=10):
    """Run a ``match_phrase`` query for ``query`` on the ``source__content`` field.

    The index name is read from the ``index`` environment variable and the
    request is sliced to at most ``num_results`` hits before execution.

    Returns:
        The response object produced by executing the search.
    """
    index_name = os.environ.get("index")
    phrase_search = Search(using=es, index=index_name)
    phrase_search.query = Q("match_phrase", source__content=query)
    limited = phrase_search[:num_results]
    return limited.execute()
|
54 |
+
|
55 |
+
def search(query, num_results=10):
    """Search the index for ``query`` and return the hits rendered as HTML.

    A query wrapped in double quotes is run through ``phrase_query``; anything
    else goes through ``match_query``.

    Args:
        query: The user's search text.
        num_results: Maximum number of hits to retrieve.

    Returns:
        The HTML fragment produced by ``process_results``.
    """
    if not query:
        # Indexing into an empty query would raise IndexError; report
        # "no results" instead of crashing.
        return process_results([])

    # Require an opening AND a closing quote, so that a lone '"' character is
    # not mistaken for a quoted phrase.
    is_phrase = len(query) >= 2 and query.startswith('"') and query.endswith('"')
    run_query = phrase_query if is_phrase else match_query

    # NOTE(review): the surrounding quotes are forwarded to the phrase query
    # unchanged; confirm the index analyzer discards them.
    response = run_query(query, num_results=num_results)

    results = [
        {
            "text": hit.source.content,
            "repository": f"{hit.source.username}/{hit.source.repository}",
        }
        for hit in response
    ]
    return process_results(results)
|
62 |
+
|
63 |
+
|
64 |
+
# Markdown/HTML banner shown at the top of the page.
# NOTE(review): the emoji after "Python Dataset Search" was mis-encoded in this
# copy of the file; it has been restored as a magnifying glass. Confirm against
# the upstream Space.
description = """# <p style="text-align: center; color: white;"><span style='color: #ff75b3;'>StarCoder:</span> Python Dataset Search 🔍 </p>
<span style='color: white;'>When you use <a href="https://huggingface.co/bigcode/large-model" style="color: #ff75b3;">StarCoder</a> to generate code it might produce exact copies of code in the pretraining dataset.
In that case, the code license might have requirements to comply with.
With this search tool we aim to provide help to find out where the code came from, in order for the user to comply with licensing requirements in case the code produced by StarCoder belongs to an already existing repository. For exact matches, enclose your query in double quotes.</span>"""
|
68 |
+
|
69 |
+
|
70 |
+
if __name__ == "__main__":
    # Build the Gradio UI: banner, query box, result-count slider, submit
    # button and an HTML pane for the rendered hits.
    demo = gr.Blocks(
        css=".gradio-container {background-color: #20233fff; color:white}"
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(lines=5, placeholder="Type your query here...", label="Query")
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results", value="<img src='https://huggingface.co/datasets/bigcode/admin/resolve/main/bigcode_contact.png' alt='contact' style='display: block; margin: auto; max-width: 800px;'>")

        def submit(query_text, max_results):
            """Run the search for the textbox contents and update the results pane."""
            # Check for None *before* calling .strip(); the original order
            # would raise AttributeError on a missing value.
            cleaned = (query_text or "").strip()
            if not cleaned:
                # Exactly one output component is wired up below, so return
                # exactly one value (an empty pane) rather than a 2-tuple.
                return {results: ""}
            return {
                results: search(cleaned, max_results),
            }

        # Pressing Enter in the textbox and clicking the button both trigger
        # the same handler.
        query.submit(fn=submit, inputs=[query, k], outputs=[results])
        submit_btn.click(submit, inputs=[query, k], outputs=[results])

    demo.launch(enable_queue=True, debug=True)
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
elasticsearch
|
2 |
+
elasticsearch-dsl
|