File size: 2,585 Bytes
57535ba
9663a4b
6bbf952
 
 
57535ba
 
 
 
 
 
 
 
6bbf952
57535ba
 
 
9663a4b
57535ba
6bbf952
 
 
 
 
 
 
 
57535ba
9663a4b
57535ba
 
 
 
 
 
 
9663a4b
6bbf952
57535ba
9663a4b
 
 
 
6bbf952
 
 
9663a4b
6bbf952
 
9663a4b
6bbf952
 
 
 
 
 
 
 
9663a4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6bbf952
57535ba
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from umap_reducer import UMAPReducer
from embeddings_encoder import EmbeddingsEncoder
from flask import Flask, request, render_template, jsonify, make_response, session
from flask_session import Session
from flask_cors import CORS, cross_origin
import os
from dotenv import load_dotenv
import feedparser
import json
from dateutil import parser
import re
import numpy as np
import gzip
import hashlib

# Pull variables from a local .env file into the process environment before
# any of them are read below.
load_dotenv()

app = Flask(__name__, static_url_path='/static')

# Session settings, applied in one step. Sessions are stored on the server
# filesystem; the cookie is marked SameSite=None and Secure.
app.config.update({
    "SECRET_KEY": os.environ.get("SECRET_KEY"),
    "SESSION_PERMANENT": True,
    "SESSION_TYPE": "filesystem",
    "SESSION_COOKIE_SAMESITE": "None",
    "SESSION_COOKIE_SECURE": True,
})

# Extensions are attached only after the configuration is in place.
Session(app)
CORS(app)

# Module-level singletons shared by every request handler.
reducer = UMAPReducer()
encoder = EmbeddingsEncoder()


@app.route('/')
def index():
    """Serve the application's landing page from the `index.html` template."""
    page = render_template('index.html')
    return page


def _sentences_cache_key(sentences):
    """Return a SHA-256 hex digest identifying the given sentences payload.

    The payload is serialised as JSON before hashing so that element
    boundaries are part of the key. Hashing the plain concatenation would
    give ``["ab", "c"]`` and ``["a", "bc"]`` the same digest and let one
    input pick up the cached embeddings of the other.
    """
    # Default ensure_ascii=True keeps the serialisation pure ASCII, so the
    # encode step cannot fail on unusual code points.
    serialised = json.dumps(sentences)
    return hashlib.sha256(serialised.encode("utf-8")).hexdigest()


@app.route('/run-umap', methods=['POST'])
@cross_origin(supports_credentials=True)
def run_umap():
    """Embed the posted sentences, reduce them with UMAP and cluster them.

    Expects a JSON body of the form
    ``{"data": {"sentences": ..., "umap_options": ..., "cluster_options": ...}}``.
    Sentence embeddings are cached in the session under a hash of the
    sentences, so repeated requests with the same sentences but different
    options skip the encoding step.

    Returns:
        On success, a gzip-compressed JSON document (``Content-Encoding:
        gzip``) with the keys ``"embeddings"`` and ``"clusters"``.
        On failure, a JSON object ``{"error": <message>}`` with status 400.
    """
    input_data = request.get_json()
    try:
        payload = input_data['data']
        sentences = payload['sentences']
        umap_options = payload['umap_options']
        cluster_options = payload['cluster_options']
    except (KeyError, TypeError):
        # Missing keys or a non-object body are client errors, not a 500.
        return jsonify({
            "error": "Malformed request body: expected data.sentences, "
                     "data.umap_options and data.cluster_options"
        }), 400

    # create unique hash for input, avoid recalculating embeddings
    sentences_input_hash = _sentences_cache_key(sentences)

    print("input options:", sentences_input_hash,
          umap_options, cluster_options, "\n\n")
    try:
        if not session.get(sentences_input_hash):
            print("New input, calculating embeddings", "\n\n")
            embeddings = encoder.encode(sentences)
            # Stored as plain lists so the session backend can serialise it.
            session[sentences_input_hash] = embeddings.tolist()
        else:
            print("Input already calculated, using cached embeddings", "\n\n")
            embeddings = session[sentences_input_hash]

        # UMAP embeddings
        reducer.setParams(umap_options, cluster_options)
        umap_embeddings = reducer.embed(embeddings)
        # HDBScan cluster analysis
        clusters = reducer.clusterAnalysis(umap_embeddings)
        content = gzip.compress(json.dumps(
            {
                "embeddings": umap_embeddings.tolist(),
                "clusters": clusters.labels_.tolist()
            }
        ).encode('utf8'), 5)
        # NOTE(review): the payload is JSON but no Content-Type is set here;
        # confirm what the client expects before changing it.
        response = make_response(content)
        response.headers['Content-length'] = len(content)
        response.headers['Content-Encoding'] = 'gzip'
        return response
    except Exception as e:
        # Request boundary: report any processing failure to the client.
        return jsonify({"error": str(e)}), 400


if __name__ == '__main__':
    # Listen on all interfaces; the port comes from the PORT environment
    # variable and falls back to 7860 when it is not set.
    listen_port = int(os.environ.get('PORT', 7860))
    app.run(host='0.0.0.0', port=listen_port)