Spaces:
Running
Running
liujch1998
committed on
Commit
•
cb08e07
1
Parent(s):
7474206
Sync changes
Browse files- app.py +5 -4
- constants.py +2 -9
app.py
CHANGED
@@ -9,6 +9,7 @@ def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Req
|
|
9 |
corpus = CORPUS_BY_DESC[corpus_desc]
|
10 |
engine = ENGINE_BY_DESC[engine_desc]
|
11 |
data = {
|
|
|
12 |
'timestamp': timestamp,
|
13 |
'query_type': query_type,
|
14 |
'corpus': corpus,
|
@@ -18,9 +19,9 @@ def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Req
|
|
18 |
if maxnum is not None:
|
19 |
data['maxnum'] = maxnum
|
20 |
print(json.dumps(data))
|
21 |
-
if
|
22 |
-
raise ValueError(f'
|
23 |
-
response = requests.post(
|
24 |
if response.status_code == 200:
|
25 |
result = response.json()
|
26 |
else:
|
@@ -230,7 +231,7 @@ with gr.Blocks() as demo:
|
|
230 |
<li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
|
231 |
<li>If the document is too long, it will be truncated to {MAX_OUTPUT_DOC_TOKENS} tokens.</li>
|
232 |
<li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
|
233 |
-
<li>If you query for two or more clauses, and a clause has more than {
|
234 |
<li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
|
235 |
</ul>
|
236 |
<p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
|
|
|
9 |
corpus = CORPUS_BY_DESC[corpus_desc]
|
10 |
engine = ENGINE_BY_DESC[engine_desc]
|
11 |
data = {
|
12 |
+
'source': 'hf' if not DEBUG else 'hf-dev',
|
13 |
'timestamp': timestamp,
|
14 |
'query_type': query_type,
|
15 |
'corpus': corpus,
|
|
|
19 |
if maxnum is not None:
|
20 |
data['maxnum'] = maxnum
|
21 |
print(json.dumps(data))
|
22 |
+
if API_URL is None:
|
23 |
+
raise ValueError(f'API_URL envvar is not set!')
|
24 |
+
response = requests.post(API_URL, json=data)
|
25 |
if response.status_code == 200:
|
26 |
result = response.json()
|
27 |
else:
|
|
|
231 |
<li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
|
232 |
<li>If the document is too long, it will be truncated to {MAX_OUTPUT_DOC_TOKENS} tokens.</li>
|
233 |
<li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
|
234 |
+
<li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ_PER_SHARD} matches (per shard), we will estimate the count from a random subset of all documents containing that clause. This might cause a zero count on the conjunction of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
|
235 |
<li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
|
236 |
</ul>
|
237 |
<p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
|
constants.py
CHANGED
@@ -22,22 +22,15 @@ MAX_INPUT_DOC_TOKENS = int(os.environ.get('MAX_INPUT_DOC_TOKENS', 1000))
|
|
22 |
MAX_OUTPUT_DOC_TOKENS = int(os.environ.get('MAX_OUTPUT_DOC_TOKENS', 5000))
|
23 |
MAX_OUTPUT_NUM_DOCS = int(os.environ.get('MAX_OUTPUT_NUM_DOCS', 10)) # This number is also hard-coded in app.py
|
24 |
MAX_CNT_FOR_NTD = int(os.environ.get('MAX_CNT_FOR_NTD', 1000))
|
25 |
-
|
26 |
-
MAX_CLAUSE_FREQ_FAST = int(os.environ.get('MAX_CLAUSE_FREQ_FAST', 1000000))
|
27 |
-
MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD = int(os.environ.get('MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD', 50000))
|
28 |
MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 100))
|
29 |
MAX_DIFF_BYTES = 2 * MAX_DIFF_TOKENS
|
30 |
MAX_CLAUSES_IN_CNF = int(os.environ.get('MAX_CLAUSES_IN_CNF', 4))
|
31 |
MAX_TERMS_IN_DISJ_CLAUSE = int(os.environ.get('MAX_TERMS_IN_DISJ_CLAUSE', 4))
|
32 |
|
33 |
# HF demo
|
34 |
-
|
35 |
DEFAULT_CONCURRENCY_LIMIT = os.environ.get('DEFAULT_CONCURRENCY_LIMIT', 10)
|
36 |
MAX_SIZE = os.environ.get('MAX_SIZE', 100)
|
37 |
MAX_THREADS = os.environ.get('MAX_THREADS', 40)
|
38 |
DEBUG = (os.environ.get('DEBUG', 'False') != 'False')
|
39 |
-
|
40 |
-
# C++ engine
|
41 |
-
CPP_PORT = int(os.environ.get('CPP_PORT', 3786))
|
42 |
-
SOCKET_IN_BUFFER_SIZE = 2048
|
43 |
-
SOCKET_OUT_BUFFER_SIZE = 65536
|
|
|
22 |
MAX_OUTPUT_DOC_TOKENS = int(os.environ.get('MAX_OUTPUT_DOC_TOKENS', 5000))
|
23 |
MAX_OUTPUT_NUM_DOCS = int(os.environ.get('MAX_OUTPUT_NUM_DOCS', 10)) # This number is also hard-coded in app.py
|
24 |
MAX_CNT_FOR_NTD = int(os.environ.get('MAX_CNT_FOR_NTD', 1000))
|
25 |
+
MAX_CLAUSE_FREQ_PER_SHARD = int(os.environ.get('MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD', 50000))
|
|
|
|
|
26 |
MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 100))
|
27 |
MAX_DIFF_BYTES = 2 * MAX_DIFF_TOKENS
|
28 |
MAX_CLAUSES_IN_CNF = int(os.environ.get('MAX_CLAUSES_IN_CNF', 4))
|
29 |
MAX_TERMS_IN_DISJ_CLAUSE = int(os.environ.get('MAX_TERMS_IN_DISJ_CLAUSE', 4))
|
30 |
|
31 |
# HF demo
|
32 |
+
API_URL = os.environ.get('API_URL', None)
|
33 |
DEFAULT_CONCURRENCY_LIMIT = os.environ.get('DEFAULT_CONCURRENCY_LIMIT', 10)
|
34 |
MAX_SIZE = os.environ.get('MAX_SIZE', 100)
|
35 |
MAX_THREADS = os.environ.get('MAX_THREADS', 40)
|
36 |
DEBUG = (os.environ.get('DEBUG', 'False') != 'False')
|
|
|
|
|
|
|
|
|
|