Spaces:
Running
Running
Upload 4 files
Browse files- .gitattributes +1 -0
- app.py +89 -71
- header-image-1.png +3 -0
- query.py +66 -16
.gitattributes
CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
header-image-2.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
header-image-2.png filter=lfs diff=lfs merge=lfs -text
|
37 |
+
header-image-1.png filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1 |
from omegaconf import OmegaConf
|
2 |
from query import VectaraQuery
|
3 |
import os
|
4 |
-
import requests
|
5 |
|
6 |
import streamlit as st
|
7 |
from PIL import Image
|
|
|
|
|
|
|
8 |
|
9 |
def inject_custom_css():
|
10 |
st.markdown(
|
@@ -17,12 +19,18 @@ def inject_custom_css():
|
|
17 |
color: #333;
|
18 |
}
|
19 |
body {
|
|
|
|
|
|
|
20 |
padding-top: 0px;
|
|
|
21 |
}
|
22 |
.stApp {
|
23 |
-
padding-top:
|
|
|
24 |
}
|
25 |
.stButton>button {
|
|
|
26 |
background-color: #4CAF50;
|
27 |
color: white;
|
28 |
padding: 10px 24px;
|
@@ -61,80 +69,90 @@ def inject_custom_css():
|
|
61 |
.css-1d391kg { /* This targets the sidebar headings */
|
62 |
color: #333 !important;
|
63 |
}
|
64 |
-
.form-container {
|
65 |
-
display: flex;
|
66 |
-
justify-content: space-between;
|
67 |
-
align-items: center;
|
68 |
-
}
|
69 |
-
.form-container .stTextInput {
|
70 |
-
flex: 1;
|
71 |
-
}
|
72 |
-
.form-container .stButton {
|
73 |
-
margin-left: 10px;
|
74 |
-
}
|
75 |
</style>
|
76 |
""",
|
77 |
unsafe_allow_html=True
|
78 |
)
|
79 |
|
80 |
-
def
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
"
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
|
138 |
if __name__ == "__main__":
|
139 |
-
|
140 |
|
|
|
1 |
from omegaconf import OmegaConf
|
2 |
from query import VectaraQuery
|
3 |
import os
|
|
|
4 |
|
5 |
import streamlit as st
|
6 |
from PIL import Image
|
7 |
+
import concurrent.futures
|
8 |
+
|
9 |
+
SCORE_THRESHOLD = 0.7
|
10 |
|
11 |
def inject_custom_css():
|
12 |
st.markdown(
|
|
|
19 |
color: #333;
|
20 |
}
|
21 |
body {
|
22 |
+
font-family: 'Roboto', sans-serif;
|
23 |
+
background-color: #f5f5f5;
|
24 |
+
color: #333;
|
25 |
padding-top: 0px;
|
26 |
+
margin-top: 0px;
|
27 |
}
|
28 |
.stApp {
|
29 |
+
padding-top: 0px;
|
30 |
+
margin-top: 0px;
|
31 |
}
|
32 |
.stButton>button {
|
33 |
+
margin-top: 25px;
|
34 |
background-color: #4CAF50;
|
35 |
color: white;
|
36 |
padding: 10px 24px;
|
|
|
69 |
.css-1d391kg { /* This targets the sidebar headings */
|
70 |
color: #333 !important;
|
71 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
</style>
|
73 |
""",
|
74 |
unsafe_allow_html=True
|
75 |
)
|
76 |
|
77 |
+
def fetch_summary(vq, matching_text, doc_id):
    """Thread-pool target: delegate to the VectaraQuery client to summarize
    the scene containing *matching_text* within document *doc_id*."""
    summary = vq.get_summary(matching_text, doc_id)
    return summary
|
79 |
+
|
80 |
+
def launch_app():
    """Render the "Where did I hear that line?" Streamlit demo.

    Looks up a user-supplied movie quote with Vectara, embeds the matching
    video clip, and fetches a scene summary on a background thread, polling
    via st.rerun() until the summary future resolves.
    """
    # FIX: the original wrapped this whole body in
    # `with concurrent.futures.ThreadPoolExecutor() as executor:`. Streamlit
    # re-executes the script on every interaction, so each run (and each
    # st.rerun() poll) exited the context manager, which blocks waiting for
    # the pending summary future and shuts the executor down -- defeating the
    # background fetch. Keep one executor per session instead so submitted
    # futures survive reruns.
    if 'executor' not in st.session_state:
        st.session_state.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    executor = st.session_state.executor

    # One-time config/client setup, cached in session state across reruns.
    if 'cfg' not in st.session_state:
        cfg = OmegaConf.create({
            'customer_id': str(os.environ['VECTARA_CUSTOMER_ID']),
            'corpus_id': str(os.environ['VECTARA_CORPUS_ID']),
            'api_key': str(os.environ['VECTARA_API_KEY']),
            'streaming': False
        })
        st.session_state.cfg = cfg
        st.session_state.vq = VectaraQuery(cfg.api_key, cfg.customer_id, [cfg.corpus_id])

    cfg = st.session_state.cfg
    vq = st.session_state.vq
    st.set_page_config(page_title="Media Demo", layout="wide")
    inject_custom_css()

    # Header banner, cropped to a 150px-tall strip.
    header_image = Image.open('header-image-2.png')
    cropped_image = header_image.crop((0, 0, header_image.width, 150))
    st.image(cropped_image, use_column_width=True)

    # left side content
    with st.sidebar:
        image = Image.open('vectara-logo.png')
        st.markdown("## Welcome to Media Demo\n\n"
                    "This demo uses Vectara to find the movie where a quote is from.\n\n"
                    "Covers movies from this [playlist](https://www.youtube.com/playlist?list=PLHPTxTxtC0ibVZrT2_WKWUl2SAxsKuKwx) of free movies.")

        st.markdown("---")
        st.markdown(
            "## How this works?\n"
            "This app was built with [Vectara](https://vectara.com).\n"
        )
        st.markdown("---")
        st.image(image, width=250)

    st.markdown("<center> <h3>\"Where did I hear that line?\"</h3> </center>", unsafe_allow_html=True)

    _, q_col, _ = st.columns([1, 4, 1])
    with q_col:
        quote = st.text_input("quote", label_visibility="hidden", placeholder="Enter a quote from a movie.")
        # Only hit the Vectara API when the quote actually changed.
        prev_quote = st.session_state.get('prev_quote', '')
        if quote != prev_quote:
            st.session_state.quote = quote
            st.session_state.prev_quote = quote
            st.session_state.movie_name, st.session_state.match_url, st.session_state.score, doc_id, matching_text = vq.submit_query(quote)
            if st.session_state.score < SCORE_THRESHOLD:
                # Low-confidence match: treat as "not found".
                st.session_state.movie_name = None
            else:
                # Kick off the scene summary in the background while the clip renders.
                st.session_state.summary_future = executor.submit(fetch_summary, vq, matching_text, doc_id)

    if 'score' in st.session_state and st.session_state.score:
        if st.session_state.movie_name is None:
            st.write("Sorry, I couldn't find a match for that quote. Please try another one.")
        else:
            # match_url is assumed shaped like "<video_url>&t=<seconds>s" -- the
            # split/strip below depends on that (TODO confirm against submit_query).
            video_url, start_time = st.session_state.match_url.split('&t=')
            video_url = f"{video_url}&cc_load_policy=1"
            start_time = start_time[:-1]  # remove the trailing 's'

            _, video_col, summary_col = st.columns([1, 4, 3])
            with video_col:
                st.video(video_url, start_time=int(float(start_time)))
            with summary_col:
                # Display the summary when it's ready
                if 'summary_future' in st.session_state:
                    if st.session_state.summary_future.done():
                        st.markdown("**Summary:**")
                        st.session_state.summary = st.session_state.summary_future.result()
                        st.markdown(st.session_state.summary)
                    else:
                        # Poll until the background summary completes.
                        st.rerun()


if __name__ == "__main__":
    launch_app()
|
158 |
|
header-image-1.png
ADDED
Git LFS Details
|
query.py
CHANGED
@@ -1,36 +1,67 @@
|
|
1 |
import requests
|
2 |
import json
|
3 |
-
import re
|
4 |
|
5 |
class VectaraQuery():
|
6 |
-
def __init__(self, api_key: str, customer_id: str, corpus_ids: list[str]
|
7 |
self.customer_id = customer_id
|
8 |
self.corpus_ids = corpus_ids
|
9 |
self.api_key = api_key
|
10 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
def get_body(self, query_str: str):
|
13 |
corpora_key_list = [{
|
14 |
-
'
|
15 |
} for corpus_id in self.corpus_ids
|
16 |
]
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
'query': [
|
19 |
{
|
20 |
'query': query_str,
|
21 |
'start': 0,
|
22 |
-
'numResults':
|
23 |
'corpusKey': corpora_key_list,
|
24 |
-
'
|
25 |
-
'sentences_before':
|
26 |
-
'sentences_after':
|
27 |
-
'start_tag':
|
28 |
-
'end_tag':
|
29 |
},
|
30 |
-
'rerankingConfig': { 'rerankerId': 272725719 }
|
31 |
}
|
32 |
]
|
33 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
def get_headers(self):
|
36 |
return {
|
@@ -44,7 +75,7 @@ class VectaraQuery():
|
|
44 |
def submit_query(self, query_str: str):
|
45 |
|
46 |
endpoint = "https://api.vectara.io/v1/query"
|
47 |
-
body = self.get_body(query_str)
|
48 |
|
49 |
response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers())
|
50 |
if response.status_code != 200:
|
@@ -56,11 +87,12 @@ class VectaraQuery():
|
|
56 |
responses = res['responseSet'][0]['response'][:top_k]
|
57 |
documents = res['responseSet'][0]['document']
|
58 |
|
59 |
-
|
60 |
metadatas = []
|
61 |
for x in responses:
|
62 |
md = {m["name"]: m["value"] for m in x["metadata"]}
|
63 |
doc_num = x["documentIndex"]
|
|
|
|
|
64 |
doc_md = {f'doc_{m["name"]}': m["value"] for m in documents[doc_num]["metadata"]}
|
65 |
md.update(doc_md)
|
66 |
metadatas.append(md)
|
@@ -68,6 +100,24 @@ class VectaraQuery():
|
|
68 |
movie_title = metadatas[0].get("doc_title", None)
|
69 |
snippet_url = metadatas[0].get("url", None)
|
70 |
score = responses[0]["score"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
-
return
|
73 |
|
|
|
1 |
import requests
|
2 |
import json
|
|
|
3 |
|
4 |
class VectaraQuery():
|
5 |
+
def __init__(self, api_key: str, customer_id: str, corpus_ids: list[str]):
|
6 |
self.customer_id = customer_id
|
7 |
self.corpus_ids = corpus_ids
|
8 |
self.api_key = api_key
|
9 |
+
self.START_TAG = "<em_start>"
|
10 |
+
self.END_TAG = "<em_end>"
|
11 |
+
self.prompt_name = "vectara-summary-ext-24-05-med"
|
12 |
+
self.prompt_text = '''
|
13 |
+
[{"role": "system", "content": "Follow these detailed step-by-step instructions, your task is to generate an accurate and coherent summary of the first search result.
|
14 |
+
- You will receive a single search result enclosed in triple quotes, which includes part of a script from a movie.
|
15 |
+
- the search result can be a part of a larger movie scence, and may be incomplete.
|
16 |
+
- the text is a sequence of subtitles from the movie itself.
|
17 |
+
- Base your summary only on the information provided in the search result, do not use any other sources.
|
18 |
+
- Do no include the word summary in your response, just the summary itself.
|
19 |
+
- Summarize the scene including who the characters are, what they do and any other important detail."},
|
20 |
+
{"role": "user", "content": "#foreach ($qResult in $vectaraQueryResults) Search Result $esc.java($foreach.index + 1): \'\'\'$esc.java($qResult.text())\'\'\'.#end"}
|
21 |
+
]
|
22 |
+
'''
|
23 |
|
24 |
+
def get_body(self, query_str: str, filter: str = None, summarize: bool = True):
|
25 |
corpora_key_list = [{
|
26 |
+
'customerId': self.customer_id, 'corpusId': corpus_id, 'lexicalInterpolationConfig': {'lambda': 0.005}
|
27 |
} for corpus_id in self.corpus_ids
|
28 |
]
|
29 |
+
if filter:
|
30 |
+
for key in corpora_key_list:
|
31 |
+
key['filter'] = filter
|
32 |
+
|
33 |
+
sent_before = 15 if summarize else 1
|
34 |
+
sent_after = 15 if summarize else 1
|
35 |
+
body = {
|
36 |
'query': [
|
37 |
{
|
38 |
'query': query_str,
|
39 |
'start': 0,
|
40 |
+
'numResults': 50,
|
41 |
'corpusKey': corpora_key_list,
|
42 |
+
'contextConfig': {
|
43 |
+
'sentences_before': sent_before,
|
44 |
+
'sentences_after': sent_after,
|
45 |
+
'start_tag': self.START_TAG,
|
46 |
+
'end_tag': self.END_TAG
|
47 |
},
|
|
|
48 |
}
|
49 |
]
|
50 |
}
|
51 |
+
if summarize:
|
52 |
+
body['query'][0]['summary'] = [
|
53 |
+
{
|
54 |
+
'responseLang': 'eng',
|
55 |
+
'maxSummarizedResults': 1,
|
56 |
+
'summarizerPromptName': self.prompt_name,
|
57 |
+
'promptText': self.prompt_text
|
58 |
+
}
|
59 |
+
]
|
60 |
+
else:
|
61 |
+
body['query'][0]['rerankingConfig'] = { 'rerankerId': 272725719 } # rerank only in main query, not when summarizing
|
62 |
+
|
63 |
+
return body
|
64 |
+
|
65 |
|
66 |
def get_headers(self):
|
67 |
return {
|
|
|
75 |
def submit_query(self, query_str: str):
|
76 |
|
77 |
endpoint = "https://api.vectara.io/v1/query"
|
78 |
+
body = self.get_body(query_str, filter=None, summarize=False)
|
79 |
|
80 |
response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers())
|
81 |
if response.status_code != 200:
|
|
|
87 |
responses = res['responseSet'][0]['response'][:top_k]
|
88 |
documents = res['responseSet'][0]['document']
|
89 |
|
|
|
90 |
metadatas = []
|
91 |
for x in responses:
|
92 |
md = {m["name"]: m["value"] for m in x["metadata"]}
|
93 |
doc_num = x["documentIndex"]
|
94 |
+
doc_id = documents[doc_num]["id"]
|
95 |
+
md['doc_id'] = doc_id
|
96 |
doc_md = {f'doc_{m["name"]}': m["value"] for m in documents[doc_num]["metadata"]}
|
97 |
md.update(doc_md)
|
98 |
metadatas.append(md)
|
|
|
100 |
movie_title = metadatas[0].get("doc_title", None)
|
101 |
snippet_url = metadatas[0].get("url", None)
|
102 |
score = responses[0]["score"]
|
103 |
+
doc_id = metadatas[0]["doc_id"]
|
104 |
+
matching_text = responses[0]["text"].split(self.START_TAG)[1].split(self.END_TAG)[0].strip()
|
105 |
+
|
106 |
+
return movie_title, snippet_url, score, doc_id, matching_text
|
107 |
+
|
108 |
+
def get_summary(self, query_str: str, doc_id: str):
|
109 |
+
|
110 |
+
endpoint = "https://api.vectara.io/v1/query"
|
111 |
+
filter = f"doc.id == '{doc_id}'"
|
112 |
+
body = self.get_body(query_str, filter, summarize=True)
|
113 |
+
|
114 |
+
response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers())
|
115 |
+
if response.status_code != 200:
|
116 |
+
print(f"Query failed with code {response.status_code}, reason {response.reason}, text {response.text}")
|
117 |
+
return "Sorry, something went wrong in my brain. Please try again later."
|
118 |
+
|
119 |
+
res = response.json()
|
120 |
+
summary = res['responseSet'][0]['summary'][0]['text']
|
121 |
|
122 |
+
return summary
|
123 |
|