Spaces:

yuhuili
/

EAGLE

Sleeping

yuhuili committed on Dec 9, 2023

Commit

9647dd6

•

1 Parent(s): b66c71a

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -193,7 +193,7 @@ parser.add_argument(
 parser.add_argument("--base-model-path", type=str, default="lmsys/vicuna-7b-v1.3",
                     help="path of basemodel, huggingface project or local path")
 parser.add_argument(
-    "--load-in-8bit", action="store_true", help="Use 8-bit quantization"
 )
 parser.add_argument(
     "--load-in-4bit", action="store_true", help="Use 4-bit quantization"
@@ -233,7 +233,7 @@ with gr.Blocks(css=custom_css) as demo:
         speed_box = gr.Textbox(label="Speed", elem_id="speed", interactive=False, value="0.00 tokens/s")
         compression_box = gr.Textbox(label="Compression Ratio", elem_id="speed", interactive=False, value="0.00")
     note1 = gr.Markdown(show_label=False, interactive=False,
-                       value='''The Compression Ratio is defined as the number of generated tokens divided by the number of forward passes in the original LLM. The original LLM is Vicuna 13B, with inference conducted on RTX 3090 GPUs and at a precision of fp16.''')
     note=gr.Markdown(show_label=False,interactive=False,value='''The tokens that EAGLE correctly guesses will be highlighted in orange. Note: This highlighting may lead to special formatting rendering issues in some instances, particularly when generating code.''')

 parser.add_argument("--base-model-path", type=str, default="lmsys/vicuna-7b-v1.3",
                     help="path of basemodel, huggingface project or local path")
 parser.add_argument(
+    "--load-in-8bit", action="store_false", help="Use 8-bit quantization"
 )
 parser.add_argument(
     "--load-in-4bit", action="store_true", help="Use 4-bit quantization"
         speed_box = gr.Textbox(label="Speed", elem_id="speed", interactive=False, value="0.00 tokens/s")
         compression_box = gr.Textbox(label="Compression Ratio", elem_id="speed", interactive=False, value="0.00")
     note1 = gr.Markdown(show_label=False, interactive=False,
+                       value='''The Compression Ratio is defined as the number of generated tokens divided by the number of forward passes in the original LLM. The original LLM is Vicuna 7B, with inference conducted on a T4 GPU and at a precision of int8.''')
     note=gr.Markdown(show_label=False,interactive=False,value='''The tokens that EAGLE correctly guesses will be highlighted in orange. Note: This highlighting may lead to special formatting rendering issues in some instances, particularly when generating code.''')