File size: 1,940 Bytes
d727a17 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
from flask import Flask, jsonify, render_template, request, Response
from transformers import pipeline
import time
# As an alternative, see https://github.com/hyperonym/basaran
app = Flask(__name__)

# NOTE: using pipelines does not work for streaming.
# Streaming probably has to be implemented directly against the model.
# Given that LLaMA is not fully supported by HF, that might even be the
# better solution.
# TODO: use AutoTokenizer and AutoModel.
# Encode the input, then generate the output. The resulting length is the
# number of input tokens plus the number of output tokens.
# Maybe look into this: https://huggingface.co/blog/how-to-generate

# Text-generation pipelines keyed by the chatbot name that requests send.
# Both names currently load the same "distilgpt2" checkpoint, each as its own
# pipeline instance.
model_pipelines = {
    name: pipeline("text-generation", model=checkpoint)
    for name, checkpoint in (
        ("opt-6.7", "distilgpt2"),
        ("alpaca-7", "distilgpt2"),
    )
}
@app.route("/", methods=["GET"])
def index():
    """Render and return the ``index.html`` template for GET requests to ``/``."""
    page = render_template("index.html")
    return page
def _sse_event(payload):
    """Frame *payload* as a single Server-Sent Events message.

    Every line of the payload gets its own ``data:`` field and the message is
    closed by a blank line; without that terminator a client never dispatches
    the event.
    """
    lines = payload.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    return "".join(f"data: {line}\n" for line in lines) + "\n"


def generate_response(model, input_text, *, delay=0.5, max_chars=2000,
                      max_rounds=50):
    """Stream a completion for *input_text* as Server-Sent Events.

    The model is called repeatedly on the text produced so far; each round's
    newly generated text is yielded as one SSE message, and a final
    ``data: END`` message marks the end of the stream.

    Args:
        model: Callable invoked as ``model(text, max_new_tokens=10)`` that
            returns a list whose first item is a dict with a
            ``"generated_text"`` entry.
        input_text: The prompt to continue.
        delay: Seconds to pause before emitting each chunk. The default of
            0.5 keeps the previously hard-coded simulated processing time;
            pass 0 to disable it.
        max_chars: Stop once more than this many characters were generated.
        max_rounds: Upper bound on the number of model calls, so the loop
            always terminates.

    Yields:
        SSE-framed strings: one per non-empty chunk, then ``"data: END\\n\\n"``.
    """
    generated_text = input_text
    produced = 0
    for _ in range(max_rounds):
        # max_new_tokens rather than max_length: max_length also counts the
        # prompt tokens, which leaves no budget once the context has grown.
        full_text = model(generated_text, max_new_tokens=10)[0]["generated_text"]

        # Keep only what is new. When the output echoes the context we passed
        # in, drop that prefix; otherwise treat the whole output as new text.
        if full_text.startswith(generated_text):
            chunk = full_text[len(generated_text):]
        else:
            chunk = full_text
        if not chunk:
            # Nothing new was generated; further rounds would only spin.
            break

        if delay > 0:
            time.sleep(delay)
        # Append just the new text so the context is not duplicated.
        generated_text += chunk
        produced += len(chunk)
        yield _sse_event(chunk)

        if produced > max_chars:
            break
    yield "data: END\n\n"
@app.route("/get_response", methods=["GET"])
def chat():
    """Return a one-shot (non-streaming) completion as JSON.

    Query parameters:
        chatbot: Key into ``model_pipelines`` selecting the pipeline to use.
        message: The prompt text.

    Returns:
        A JSON response of the form ``{"response": <text>}``. ``<text>`` is
        the generated text with the leading prompt removed, or
        ``"Something went wrong"`` when a parameter is missing or empty, or
        when ``chatbot`` is not a key of ``model_pipelines``.
    """
    model_name = request.args.get("chatbot")
    input_text = request.args.get("message")

    # Look the model up with .get() so an unknown name takes the same error
    # path as a missing parameter instead of raising KeyError (HTTP 500).
    model = model_pipelines.get(model_name)
    if model is None or not input_text:
        return jsonify({"response": "Something went wrong"})

    response = model(input_text)[0]["generated_text"]
    # Remove the prompt only where it is echoed at the start of the output.
    # A blanket str.replace would also delete every later occurrence of the
    # prompt text (e.g. the input "a" would strip all letters "a").
    if response.startswith(input_text):
        response = response[len(input_text):]
    return jsonify({"response": response})
if __name__ == "__main__":
    # Only when this file is executed as a script: start the Flask server
    # with debug mode switched on.
    # NOTE(review): debug=True is presumably intended for local development
    # only - confirm before deploying.
    app.run(debug=True)
|