Spaces:

liujch1998
/

creativity

Running

App Files Files Community

liujch1998 committed on Sep 2, 2024

Commit

25f66ac

1 Parent(s): ab4c12e

Initial commit

Browse files

Files changed (4) hide show

README.md +4 -4
app.py +127 -0
constants.py +24 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
-title: Creativity
-emoji: 👀
-colorFrom: gray
-colorTo: red
 sdk: gradio
 sdk_version: 4.42.0
 app_file: app.py

 ---
+title: Creativity Index
+emoji: 👩🏽‍🎨
+colorFrom: blue
+colorTo: green
 sdk: gradio
 sdk_version: 4.42.0
 app_file: app.py

app.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import gradio as gr
+import datetime
+import json
+import requests
+from constants import *
+def process(query_type, index_desc, **kwargs):
+    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
+    index = INDEX_BY_DESC[index_desc]
+    data = {
+        'source': 'hf' if not DEBUG else 'hf-dev',
+        'timestamp': timestamp,
+        'query_type': query_type,
+        'index': index,
+    }
+    data.update(kwargs)
+    print(json.dumps(data))
+    if API_URL is None:
+        raise ValueError(f'API_URL envvar is not set!')
+    try:
+        response = requests.post(API_URL, json=data, timeout=10)
+    except requests.exceptions.Timeout:
+        raise ValueError('Web request timed out. Please try again later.')
+    except requests.exceptions.RequestException as e:
+        raise ValueError(f'Web request error: {e}')
+    if response.status_code == 200:
+        result = response.json()
+    else:
+        raise ValueError(f'HTTP error {response.status_code}: {response.json()}')
+    if DEBUG:
+        print(result)
+    return result
+def creativity(index_desc, query):
+    result = process('creativity', index_desc, query=query)
+    latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
+    if 'error' in result:
+        ci = result['error']
+        ngram_len = NGRAM_LEN_DEFAULT
+        html = ''
+        return latency, ci, ngram_len, html
+    rs = result['rs']
+    tokens = result['tokens']
+    highlighteds_by_n = {}
+    uniqueness_by_n = {}
+    for n in range(NGRAM_LEN_MIN, NGRAM_LEN_MAX + 1):
+        highlighteds = [False] * len(tokens)
+        last_r = 0
+        for l, r in enumerate(rs):
+            if r - l < n:
+                continue
+            for i in range(max(last_r, l), r):
+                highlighteds[i] = True
+            last_r = r
+        uniqueness = sum([1 for h in highlighteds if not h]) / len(highlighteds)
+        highlighteds_by_n[n] = highlighteds
+        uniqueness_by_n[n] = uniqueness
+    ci = sum(uniqueness_by_n.values()) / len(uniqueness_by_n)
+    ci = f'{ci:.2%}'
+    ngram_len = NGRAM_LEN_DEFAULT
+    html = ''
+    highlighted = highlighteds_by_n[ngram_len]
+    line_len = 0
+    for i, (token, highlighted) in enumerate(zip(tokens, highlighteds)):
+        if line_len >= 100 and token.startswith('Ġ') and token != 'Ċ':
+            html += '<br/>'
+            line_len = 0
+        color = '0, 0, 255, 0.5'
+        if token == 'Ċ':
+            disp_token = '\\n'
+            is_linebreak = True
+        else:
+            disp_token = token.replace('Ġ', '&nbsp;')
+            is_linebreak = False
+        if highlighted:
+            html += f'<span id="hldoc-token-{i}" style="background-color: rgba{color};" class="background-color: rgba{color};">{disp_token}</span>'
+        else:
+            html += disp_token
+        if is_linebreak:
+            html += '<br/>'
+            line_len = 0
+        else:
+            line_len += len(token)
+    html = '<div><p id="hldoc" style="font-size: 16px;">' + html.strip(' ') + '</p></div>'
+    return latency, ci, ngram_len, html
+with gr.Blocks() as demo:
+    with gr.Column():
+        gr.HTML(
+            '''<h1 text-align="center">Creativity Index</h1>
+            <p style='font-size: 16px;'>Compute the <a href="">Creativity Index</a> of a piece of text.</p>
+            <p style='font-size: 16px;'>The computed Creativity Index is based on verbatim match and is supported by <a href="https://infini-gram.io">infini-gram</a>.</p>
+            '''
+        )
+        with gr.Row():
+            with gr.Column(scale=1, min_width=240):
+                index_desc = gr.Radio(choices=INDEX_DESCS, label='Corpus', value=INDEX_DESCS[0])
+            with gr.Column(scale=3):
+                creativity_query = gr.Textbox(placeholder='Enter a piece of text here', label='Query', interactive=True, lines=10)
+                with gr.Row():
+                    creativity_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
+                    creativity_submit = gr.Button(value='Submit', variant='primary', visible=True)
+                creativity_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
+            with gr.Column(scale=4):
+                creativity_ci = gr.Label(value='', label='Creativity Index')
+                creativity_ngram_len = gr.Slider(minimum=NGRAM_LEN_MIN, maximum=NGRAM_LEN_MAX, value=NGRAM_LEN_DEFAULT, step=1, label='Length of n-gram')
+                creativity_html = gr.HTML(value='', label='Coverage')
+            creativity_clear.add([creativity_query, creativity_latency, creativity_ci, creativity_html])
+            creativity_submit.click(creativity, inputs=[index_desc, creativity_query], outputs=[creativity_latency, creativity_ci, creativity_ngram_len, creativity_html], api_name=False)
+# Configure the request queue, then start the app. The limits and the debug
+# flag come from constants.py, where each can be overridden by an
+# environment variable of the same name.
+demo.queue(
+    default_concurrency_limit=DEFAULT_CONCURRENCY_LIMIT,
+    max_size=MAX_SIZE,
+    api_open=False,
+).launch(
+    max_threads=MAX_THREADS,
+    debug=DEBUG,
+    show_api=False,
+)

constants.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import os
+# options
+INDEX_BY_DESC = {
+    'Dolma-v1.7 (2.6T tokens)': 'v4_dolma-v1_7_llama',
+    'RedPajama (1.4T tokens)': 'v4_rpj_llama_s4',
+    'Pile-train (380B tokens)': 'v4_piletrain_llama',
+    'C4-train (200B tokens)': 'v4_c4train_llama',
+    'Pile-val (390M tokens)': 'v4_pileval_llama',
+}
+INDEX_DESCS = list(INDEX_BY_DESC.keys())
+# API limits and defaults
+MAX_QUERY_CHARS = int(os.environ.get('MAX_QUERY_CHARS', 1000))
+NGRAM_LEN_DEFAULT = int(os.environ.get('NGRAM_LEN_DEFAULT', 8))
+NGRAM_LEN_MIN = int(os.environ.get('NGRAM_LEN_MIN', 5))
+NGRAM_LEN_MAX = int(os.environ.get('NGRAM_LEN_MAX', 11))
+# HF demo
+API_URL = os.environ.get('API_URL', None)
+DEFAULT_CONCURRENCY_LIMIT = os.environ.get('DEFAULT_CONCURRENCY_LIMIT', 10)
+MAX_SIZE = os.environ.get('MAX_SIZE', 100)
+MAX_THREADS = os.environ.get('MAX_THREADS', 40)
+DEBUG = (os.environ.get('DEBUG', 'False') != 'False')

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch==1.13.1
+transformers==4.31.0
+tokenizers==0.13.3
+sentencepiece==0.1.96
+huggingface_hub==0.14.1
+requests