toaster61 committed
Commit c43baac · 1 Parent(s): fb62a18
7b model
Files changed:
- Dockerfile +1 -1
- gradio_app.py +5 -4
Dockerfile
CHANGED
@@ -19,7 +19,7 @@ RUN mkdir translator
 RUN chmod -R 777 translator
 
 # Installing wget and downloading model.
-ADD https://huggingface.co/TheBloke/
+ADD https://huggingface.co/TheBloke/dolphin-2.2.1-AshhLimaRP-Mistral-7B-GGUF/resolve/main/dolphin-2.2.1-ashhlimarp-mistral-7b.Q5_0.gguf /app/model.bin
 RUN chmod -R 777 /app/model.bin
 # You can use other models! Or u can comment this two RUNs and include in Space/repo/Docker image own model with name "model.bin".
 
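The comment above invites swapping in other models. As an alternative to baking the multi-gigabyte download into the image with ADD, the file could also be fetched at startup. A minimal Python sketch using huggingface_hub; the helper name and the runtime-download approach are illustrative, not part of this commit:

# Sketch (illustrative, not from this commit): fetch the GGUF at container
# start instead of baking it into the image with ADD.
# Assumes the huggingface_hub package is installed.
import shutil

from huggingface_hub import hf_hub_download

def fetch_model(repo_id: str, filename: str, dest: str = "/app/model.bin") -> str:
    # Download into the local HF cache (or reuse it), then copy the file
    # to the fixed path the app expects ("model.bin").
    cached = hf_hub_download(repo_id=repo_id, filename=filename)
    shutil.copy(cached, dest)
    return dest

fetch_model(
    "TheBloke/dolphin-2.2.1-AshhLimaRP-Mistral-7B-GGUF",
    "dolphin-2.2.1-ashhlimarp-mistral-7b.Q5_0.gguf",
)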
gradio_app.py
CHANGED
@@ -17,7 +17,7 @@ print("! SETTING MODEL IN EVALUATION MODE !")
 translator_model.eval()
 print("! INITING LLAMA MODEL !")
 llm = Llama(model_path="./model.bin") # LLaMa model
-llama_model_name = "TheBloke/
+llama_model_name = "TheBloke/dolphin-2.2.1-AshhLimaRP-Mistral-7B-GGUF"
 print("! INITING DONE !")
 
 # Preparing things to work
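Note that llm = Llama(model_path="./model.bin") relies on library defaults for everything else. A hedged sketch of the same initialization with the knobs most relevant to a CPU-only Space; the extra parameters and their values are illustrative, not from this commit:

# Sketch: the same llama-cpp-python initialization with commonly tuned
# options spelled out. Only model_path appears in the commit; the values
# below are illustrative assumptions.
from llama_cpp import Llama

llm = Llama(
    model_path="./model.bin",
    n_ctx=2048,     # context window, in tokens
    n_threads=4,    # CPU threads; this Space runs on CPU only
)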
@@ -25,7 +25,7 @@ translator_tokenizer.src_lang = "en"
 title = "llama.cpp API"
 desc = '''<h1>Hello, world!</h1>
 This is showcase how to make own server with Llama2 model.<br>
-I'm using here
+I'm using here 7b model just for example. Also here's only CPU power.<br>
 But you can use GPU power as well!<br><br>
 <h1>How to GPU?</h1>
 Change <code>`CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS`</code> in Dockerfile on <code>`CMAKE_ARGS="-DLLAMA_CUBLAS=on"`</code>. Also you can try <code>`DLLAMA_CLBLAST`</code> or <code>`DLLAMA_METAL`</code>.<br><br>
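One detail the description leaves implicit: rebuilding llama-cpp-python with -DLLAMA_CUBLAS=on only enables CUDA support; layers still have to be offloaded when the model is loaded. A sketch, with an arbitrary example value:

# Sketch: once llama-cpp-python is compiled with -DLLAMA_CUBLAS=on,
# GPU offload is requested per model via n_gpu_layers.
# 35 is an arbitrary illustrative value, not from this commit.
from llama_cpp import Llama

llm = Llama(model_path="./model.bin", n_gpu_layers=35)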
@@ -69,7 +69,7 @@ def generate_answer(request: str, max_tokens: int = 256, language: str = "en", c
     try:
         # this shitty fix will be until i willnt figure out why sometimes there is empty output
         counter = 1
-        while
+        while counter <= 10:
             logs += f"Attempt {counter} to generate answer...\n"
             output = llm(userPrompt, max_tokens=maxTokens, stop=["User:"], echo=False)
             text = output["choices"][0]["text"]
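For context on the indexing in the last line: calling a llama-cpp-python model returns an OpenAI-style completion dict. A sketch with illustrative field values:

# Sketch: shape of the dict returned by calling the model directly.
# Field values are illustrative; only the keys used above are shown.
output = llm("User: hi\nAssistant:", max_tokens=32, stop=["User:"], echo=False)
# output == {
#     "choices": [
#         {"text": " Hello! How can I help?", "finish_reason": "stop"},
#     ],
# }
text = output["choices"][0]["text"]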
@@ -77,7 +77,8 @@ def generate_answer(request: str, max_tokens: int = 256, language: str = "en", c
                 break
             counter += 1
         logs += f"Final attempt: {counter}\n"
-
+        if len(text.strip()) > 1 and text.strip() not in ['', None, ' ']:
+            text = "Sorry, but something went wrong while generating answer. Try again or fix code. If you are maintainer of this space, look into logs."
 
         if language in languages and language != "en":
             logs += f"\nTranslating from en to {language}"
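Taken together, the added lines retry generation up to ten times and then substitute an apology string. A standalone sketch of that retry-and-fallback pattern, written on the assumption that the fallback is meant to fire when the completion comes back empty (the committed guard tests the opposite condition):

# Sketch of the retry-and-fallback pattern this commit adds, assuming the
# fallback message should replace an EMPTY completion.
def generate_with_retry(prompt: str, max_tokens: int = 256, attempts: int = 10) -> str:
    text = ""
    for _ in range(attempts):
        output = llm(prompt, max_tokens=max_tokens, stop=["User:"], echo=False)
        text = output["choices"][0]["text"]
        if len(text.strip()) > 1:
            break  # got a non-trivial answer, stop retrying
    if len(text.strip()) <= 1:
        text = "Sorry, but something went wrong while generating answer."
    return text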