Update app.py
Browse files
app.py
CHANGED
@@ -93,27 +93,27 @@ def setup_database():
|
|
93 |
|
94 |
# Создаем таблицу для хранения эмбеддингов фильмов
|
95 |
cur.execute(f"""
|
96 |
-
CREATE TABLE IF NOT EXISTS {embeddings_table} (
|
97 |
movie_id INTEGER PRIMARY KEY,
|
98 |
embedding_crc32 BIGINT,
|
99 |
string_crc32 BIGINT,
|
100 |
model_name TEXT,
|
101 |
embedding vector(1024)
|
102 |
);
|
103 |
-
CREATE INDEX IF NOT EXISTS idx_string_crc32 ON {embeddings_table} (string_crc32);
|
104 |
""")
|
105 |
|
106 |
# Создаем таблицу для кэширования запросов
|
107 |
cur.execute(f"""
|
108 |
-
CREATE TABLE IF NOT EXISTS {query_cache_table} (
|
109 |
query_crc32 BIGINT PRIMARY KEY,
|
110 |
query TEXT,
|
111 |
model_name TEXT,
|
112 |
embedding vector(1024),
|
113 |
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
114 |
);
|
115 |
-
CREATE INDEX IF NOT EXISTS idx_query_crc32 ON {query_cache_table} (query_crc32);
|
116 |
-
CREATE INDEX IF NOT EXISTS idx_created_at ON {query_cache_table} (created_at);
|
117 |
""")
|
118 |
|
119 |
conn.commit()
|
@@ -146,7 +146,7 @@ def get_movies_without_embeddings():
|
|
146 |
try:
|
147 |
with conn.cursor() as cur:
|
148 |
# Получаем список ID фильмов, которые уже есть в таблице эмбеддингов
|
149 |
-
cur.execute(f"SELECT movie_id FROM {embeddings_table}")
|
150 |
existing_ids = {row[0] for row in cur.fetchall()}
|
151 |
|
152 |
# Получаем список всех фильмов из таблицы Movies с подготовленной строкой
|
@@ -157,7 +157,7 @@ def get_movies_without_embeddings():
|
|
157 |
'\\nЖанры: ' || (SELECT string_agg(genre->>'name', ', ') FROM jsonb_array_elements(data->'genres') AS genre) ||
|
158 |
'\\nОписание: ' || COALESCE(data->>'description', '')
|
159 |
AS prepared_string
|
160 |
-
FROM {movies_table}
|
161 |
""")
|
162 |
all_movies = cur.fetchall()
|
163 |
|
@@ -178,7 +178,7 @@ def get_embedding_from_db(conn, table_name, crc32_column, crc32_value, model_nam
|
|
178 |
"""Получает эмбеддинг из базы данных."""
|
179 |
try:
|
180 |
with conn.cursor() as cur:
|
181 |
-
cur.execute(f"SELECT embedding FROM {table_name} WHERE {crc32_column} = %s AND model_name = %s",
|
182 |
(crc32_value, model_name))
|
183 |
result = cur.fetchone()
|
184 |
if result and result[0]:
|
@@ -195,7 +195,7 @@ def insert_embedding(conn, table_name, movie_id, embedding_crc32, string_crc32,
|
|
195 |
normalized_embedding = normalize(embedding.reshape(1, -1))[0]
|
196 |
with conn.cursor() as cur:
|
197 |
cur.execute(f"""
|
198 |
-
INSERT INTO {table_name}
|
199 |
(movie_id, embedding_crc32, string_crc32, model_name, embedding)
|
200 |
VALUES (%s, %s, %s, %s, %s)
|
201 |
ON CONFLICT (movie_id) DO NOTHING
|
@@ -270,7 +270,7 @@ def process_movies():
|
|
270 |
|
271 |
if not batch:
|
272 |
break
|
273 |
-
|
274 |
executor.submit(process_batch, batch)
|
275 |
logging.info(f"Отправлен на обработку пакет из {len(batch)} фильмов.")
|
276 |
except Exception as e:
|
@@ -291,7 +291,7 @@ def get_movie_data_from_db(conn, movie_ids):
|
|
291 |
'\\nЖанры: ' || (SELECT string_agg(genre->>'name', ', ') FROM jsonb_array_elements(data->'genres') AS genre) ||
|
292 |
'\\nОписание: ' || COALESCE(data->>'description', '')
|
293 |
AS prepared_string
|
294 |
-
FROM {movies_table}
|
295 |
WHERE id IN %s
|
296 |
""", (tuple(movie_ids),))
|
297 |
for movie_id, movie_data, prepared_string in cur.fetchall():
|
@@ -303,7 +303,7 @@ def get_movie_data_from_db(conn, movie_ids):
|
|
303 |
def rerank_with_api(query, results, top_k):
|
304 |
"""Переранжирует результаты с помощью Jina AI Reranker API."""
|
305 |
logging.info(f"Начало переранжирования для запроса: '{query}'")
|
306 |
-
|
307 |
# Получаем данные фильмов из БД
|
308 |
conn = get_db_connection()
|
309 |
movie_ids = [movie_id for movie_id, _ in results]
|
@@ -372,7 +372,7 @@ def search_movies(query, top_k=25):
|
|
372 |
try:
|
373 |
with conn.cursor() as cur:
|
374 |
cur.execute(f"""
|
375 |
-
INSERT INTO {query_cache_table} (query_crc32, query, model_name, embedding)
|
376 |
VALUES (%s, %s, %s, %s)
|
377 |
ON CONFLICT (query_crc32) DO NOTHING
|
378 |
""", (query_crc32, query, model_name, query_embedding.tolist()))
|
@@ -388,11 +388,11 @@ def search_movies(query, top_k=25):
|
|
388 |
cur.execute(f"""
|
389 |
WITH query_embedding AS (
|
390 |
SELECT embedding
|
391 |
-
FROM {query_cache_table}
|
392 |
WHERE query_crc32 = %s
|
393 |
)
|
394 |
SELECT m.movie_id, 1 - (m.embedding <=> (SELECT embedding FROM query_embedding)) as similarity
|
395 |
-
FROM {embeddings_table} m, query_embedding
|
396 |
ORDER BY similarity DESC
|
397 |
LIMIT %s
|
398 |
""", (query_crc32, int(top_k * 2)))
|
|
|
93 |
|
94 |
# Создаем таблицу для хранения эмбеддингов фильмов
|
95 |
cur.execute(f"""
|
96 |
+
CREATE TABLE IF NOT EXISTS "{embeddings_table}" (
|
97 |
movie_id INTEGER PRIMARY KEY,
|
98 |
embedding_crc32 BIGINT,
|
99 |
string_crc32 BIGINT,
|
100 |
model_name TEXT,
|
101 |
embedding vector(1024)
|
102 |
);
|
103 |
+
CREATE INDEX IF NOT EXISTS idx_string_crc32 ON "{embeddings_table}" (string_crc32);
|
104 |
""")
|
105 |
|
106 |
# Создаем таблицу для кэширования запросов
|
107 |
cur.execute(f"""
|
108 |
+
CREATE TABLE IF NOT EXISTS "{query_cache_table}" (
|
109 |
query_crc32 BIGINT PRIMARY KEY,
|
110 |
query TEXT,
|
111 |
model_name TEXT,
|
112 |
embedding vector(1024),
|
113 |
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
114 |
);
|
115 |
+
CREATE INDEX IF NOT EXISTS idx_query_crc32 ON "{query_cache_table}" (query_crc32);
|
116 |
+
CREATE INDEX IF NOT EXISTS idx_created_at ON "{query_cache_table}" (created_at);
|
117 |
""")
|
118 |
|
119 |
conn.commit()
|
|
|
146 |
try:
|
147 |
with conn.cursor() as cur:
|
148 |
# Получаем список ID фильмов, которые уже есть в таблице эмбеддингов
|
149 |
+
cur.execute(f"SELECT movie_id FROM \"{embeddings_table}\"")
|
150 |
existing_ids = {row[0] for row in cur.fetchall()}
|
151 |
|
152 |
# Получаем список всех фильмов из таблицы Movies с подготовленной строкой
|
|
|
157 |
'\\nЖанры: ' || (SELECT string_agg(genre->>'name', ', ') FROM jsonb_array_elements(data->'genres') AS genre) ||
|
158 |
'\\nОписание: ' || COALESCE(data->>'description', '')
|
159 |
AS prepared_string
|
160 |
+
FROM "{movies_table}"
|
161 |
""")
|
162 |
all_movies = cur.fetchall()
|
163 |
|
|
|
178 |
"""Получает эмбеддинг из базы данных."""
|
179 |
try:
|
180 |
with conn.cursor() as cur:
|
181 |
+
cur.execute(f"SELECT embedding FROM \"{table_name}\" WHERE \"{crc32_column}\" = %s AND model_name = %s",
|
182 |
(crc32_value, model_name))
|
183 |
result = cur.fetchone()
|
184 |
if result and result[0]:
|
|
|
195 |
normalized_embedding = normalize(embedding.reshape(1, -1))[0]
|
196 |
with conn.cursor() as cur:
|
197 |
cur.execute(f"""
|
198 |
+
INSERT INTO "{table_name}"
|
199 |
(movie_id, embedding_crc32, string_crc32, model_name, embedding)
|
200 |
VALUES (%s, %s, %s, %s, %s)
|
201 |
ON CONFLICT (movie_id) DO NOTHING
|
|
|
270 |
|
271 |
if not batch:
|
272 |
break
|
273 |
+
|
274 |
executor.submit(process_batch, batch)
|
275 |
logging.info(f"Отправлен на обработку пакет из {len(batch)} фильмов.")
|
276 |
except Exception as e:
|
|
|
291 |
'\\nЖанры: ' || (SELECT string_agg(genre->>'name', ', ') FROM jsonb_array_elements(data->'genres') AS genre) ||
|
292 |
'\\nОписание: ' || COALESCE(data->>'description', '')
|
293 |
AS prepared_string
|
294 |
+
FROM "{movies_table}"
|
295 |
WHERE id IN %s
|
296 |
""", (tuple(movie_ids),))
|
297 |
for movie_id, movie_data, prepared_string in cur.fetchall():
|
|
|
303 |
def rerank_with_api(query, results, top_k):
|
304 |
"""Переранжирует результаты с помощью Jina AI Reranker API."""
|
305 |
logging.info(f"Начало переранжирования для запроса: '{query}'")
|
306 |
+
|
307 |
# Получаем данные фильмов из БД
|
308 |
conn = get_db_connection()
|
309 |
movie_ids = [movie_id for movie_id, _ in results]
|
|
|
372 |
try:
|
373 |
with conn.cursor() as cur:
|
374 |
cur.execute(f"""
|
375 |
+
INSERT INTO "{query_cache_table}" (query_crc32, query, model_name, embedding)
|
376 |
VALUES (%s, %s, %s, %s)
|
377 |
ON CONFLICT (query_crc32) DO NOTHING
|
378 |
""", (query_crc32, query, model_name, query_embedding.tolist()))
|
|
|
388 |
cur.execute(f"""
|
389 |
WITH query_embedding AS (
|
390 |
SELECT embedding
|
391 |
+
FROM "{query_cache_table}"
|
392 |
WHERE query_crc32 = %s
|
393 |
)
|
394 |
SELECT m.movie_id, 1 - (m.embedding <=> (SELECT embedding FROM query_embedding)) as similarity
|
395 |
+
FROM "{embeddings_table}" m, query_embedding
|
396 |
ORDER BY similarity DESC
|
397 |
LIMIT %s
|
398 |
""", (query_crc32, int(top_k * 2)))
|