add normal files
- app.py +78 -0
- embeddings.py +143 -0
- log_generation.py +116 -0
- models.py +111 -0
- models.yml +1 -0
- reference_embeddings.py +44 -0
- requirements.txt +8 -0
- tasks.py +93 -0
app.py
ADDED
@@ -0,0 +1,78 @@
import os

import streamlit as st

from models import CLIP, T2T
from tasks import Summary, VideoSearch
from log_generation import download_youtube, extract_video_frames, generate_log


st.set_page_config(page_title="Socratic Models Demo", page_icon="", layout="wide")
st.title("Socratic Models Demo")

if "vlm" not in st.session_state:
    st.session_state.vlm = CLIP()

if "llm" not in st.session_state:
    st.session_state.llm = T2T()


col1, col2, _ = st.columns([2, 2, 3])
with col1:
    url = st.text_input(
        "YouTube Video URL", "https://www.youtube.com/watch?v=tQG6jYy9xto"
    )
    video_id = url.split("watch?v=")[-1]

with col2:
    st.video(url)

if not os.path.exists(f"{video_id}"):
    st.write("Video not found locally. Downloading may take several minutes. Continue?")

    click = st.button("Download")
    if not click:
        st.stop()

    st.success("Downloading...")
    download_youtube(url)
    st.write("Extracting frames...")
    extract_video_frames(
        f"{video_id}/{video_id}.mp4", dims=(600, 400), sampling_rate=100
    )
    st.write("Generating log...")
    generate_log(
        f"{video_id}/history.txt",
        f"{video_id}",
        st.session_state.vlm,
        st.session_state.llm,
    )
    refresh = st.button("Click to refresh")
    if not refresh:
        st.stop()


search = VideoSearch(video_id, st.session_state.vlm)

st.title("Video Search")
query = st.text_input("Search Query", "working at my computer")
images = search.search_engine(query)
with st.expander(label="See results"):
    for image in images:
        st.image(image)


st.title("Event Summaries")
summ = Summary(video_id, st.session_state.llm)
summaries = summ.generate_summaries()
with st.expander(label="See results"):
    for (prompt, result) in summaries:
        st.markdown("*Event Log*")
        st.write(prompt)
        st.markdown("*Summary*")
        st.write(result)


st.title("Video Event Log")
with open(f"{video_id}/history.txt", "r") as f:
    st.text(f.read())
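The app is a standard Streamlit script, so it should start with the usual entry point (assuming the packages in requirements.txt are installed and the faiss_indices/ directory has been built with reference_embeddings.py, since VectorSearch loads it):

    streamlit run app.py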
embeddings.py
ADDED
@@ -0,0 +1,143 @@
import logging
import os

import faiss
import torch

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class FaissIndex:
    def __init__(
        self,
        embedding_size=None,
        faiss_index_location=None,
        indexer=faiss.IndexFlatIP,
    ):

        if embedding_size or faiss_index_location:
            self.embedding_size = embedding_size
        else:
            raise ValueError("Must provide embedding_size or faiss_index_location")

        self.faiss_index_location = faiss_index_location
        if faiss_index_location and os.path.exists(faiss_index_location):
            self.index = faiss.read_index(faiss_index_location)
            logger.info(f"Setting embedding size ({self.index.d}) to match saved index")
            self.embedding_size = self.index.d
            if os.path.exists(faiss_index_location + ".ids"):
                with open(faiss_index_location + ".ids") as f:
                    self.id_list = f.read().split("\n")
            elif self.index.ntotal > 0:
                raise ValueError("Index file exists but ids file does not")
            else:
                self.id_list = []

        else:
            if faiss_index_location:
                os.makedirs(os.path.dirname(faiss_index_location), exist_ok=True)
            self.index = None
            self.indexer = indexer
            self.id_list = []

    def faiss_init(self):

        index = self.indexer(self.embedding_size)
        if self.faiss_index_location:
            faiss.write_index(index, self.faiss_index_location)
        self.index = index

    def add(self, inputs, ids, normalize=True):

        if not self.index:
            self.faiss_init()

        if normalize:
            faiss.normalize_L2(inputs)
        self.index.add(inputs)
        self.id_list.extend(ids)

        # persist only when an on-disk location was configured
        if self.faiss_index_location:
            faiss.write_index(self.index, self.faiss_index_location)
            with open(self.faiss_index_location + ".ids", "a") as f:
                f.write("\n".join(ids) + "\n")

    def search(self, embedding, k=10, normalize=True):

        if len(embedding.shape) == 1:
            embedding = embedding.reshape(1, -1)
        if normalize:
            faiss.normalize_L2(embedding)
        D, I = self.index.search(embedding, k)
        labels = [self.id_list[i] for i in I.squeeze()]
        return D, I, labels

    def reset(self):

        if self.index:
            self.index.reset()
        self.id_list = []
        if self.faiss_index_location:
            try:
                os.remove(self.faiss_index_location)
                os.remove(self.faiss_index_location + ".ids")
            except FileNotFoundError:
                pass

    def __len__(self):
        if self.index:
            return self.index.ntotal
        return 0


class VectorSearch:
    def __init__(self):
        self.places = self.load("places")
        self.objects = self.load("objects")

    def load(self, index_name):
        return FaissIndex(
            faiss_index_location=f"faiss_indices/{index_name}.index",
        )

    def top_places(self, query_vec, k=5):
        if isinstance(query_vec, torch.Tensor):
            query_vec = query_vec.detach().numpy()
        *_, results = self.places.search(query_vec, k=k)
        return results

    def top_objects(self, query_vec, k=5):
        if isinstance(query_vec, torch.Tensor):
            query_vec = query_vec.detach().numpy()
        *_, results = self.objects.search(query_vec, k=k)
        return results

    def prompt_activities(self, query_vec, k=5, one_shot=False):
        places = self.top_places(query_vec, k=k)
        objects = self.top_objects(query_vec, k=k)
        place_str = f"Places: {', '.join(places)}. "
        object_str = f"Objects: {', '.join(objects)}. "

        act_str = "I might be doing these 3 activities: "

        zs = place_str + object_str + act_str

        example = (
            "Places: kitchen. Objects: coffee maker. "
            f"{act_str}eating, making breakfast, grinding coffee.\n "
        )
        fs = example + place_str + object_str + act_str
        if one_shot:
            return (zs, fs)

        return zs, places, objects

    def prompt_summary(self, state_history: list, k=5):

        rec_strings = ["Event log:"]
        for rec in state_history:
            rec_strings.append(
                f"Places: {', '.join(rec.places)}. "
                f"Objects: {', '.join(rec.objects)}. "
                f"Activities: {', '.join(rec.activities)} "
            )
        question = "How would you summarize these events in a few full sentences? "
        return "\n".join(rec_strings) + "\n" + question
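A minimal sketch of using FaissIndex on its own (the demo path and the random vectors below are illustrative, not part of this commit):

    import numpy as np
    from embeddings import FaissIndex

    index = FaissIndex(embedding_size=768, faiss_index_location="faiss_indices/demo.index")
    index.reset()
    vectors = np.random.rand(3, 768).astype("float32")  # FAISS expects float32
    index.add(vectors, ["frame_0", "frame_1", "frame_2"])
    D, I, labels = index.search(np.random.rand(768).astype("float32"), k=2)
    print(labels)  # ids of the two nearest stored vectors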
log_generation.py
ADDED
@@ -0,0 +1,116 @@
import glob
import string
from datetime import datetime
from pathlib import Path

import cv2
import yt_dlp
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

from embeddings import VectorSearch, FaissIndex


def download_youtube(url, parent_dir="."):
    def extract_youtube_id(url):
        return url.split("watch?v=")[-1]

    video_path = extract_youtube_id(url)
    ydl_opts = {
        "format": "mp4",
        "outtmpl": f"{parent_dir}/{video_path}/{video_path}.%(ext)s",
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        error_code = ydl.download([url])

    return error_code


def extract_video_frames(video_path, dims=(600, 400), sampling_rate=100):
    video_dir = str(Path(video_path).parent)
    video_name = str(Path(video_path).stem)
    cap = cv2.VideoCapture(video_path)

    i = 0
    while cap.isOpened():
        ret, frame = cap.read()

        if not ret:
            break

        if i % sampling_rate == 0:
            print(i)

            frame = cv2.resize(frame, dims, fx=0, fy=0, interpolation=cv2.INTER_CUBIC)
            timestamp = datetime.utcnow().timestamp()
            cv2.imwrite(f"{video_dir}/{video_name}_{timestamp}_{i}.jpg", frame)

        i += 1

    cap.release()
    cv2.destroyAllWindows()


def strip_punctuation(text):
    return text.translate(str.maketrans("", "", string.punctuation))


def clean_response(act_text):

    act_text = act_text.lower().replace("\n", "")
    text_split = act_text.split("places")[0]
    if not text_split:
        text_split = act_text

    try:
        first_sent = sent_tokenize(text_split)[0]
    except Exception:
        first_sent = text_split

    list_split = first_sent.split(",")
    no_spaces = list(map(str.strip, list_split))

    return list(map(strip_punctuation, no_spaces))[:3]


def log_activity_from_image(image_file, frame, vlm, llm, vs, fi):
    img_embed = vlm.get_image_emb(image_file)
    fi.add(img_embed, [frame])
    zs, places, objects = vs.prompt_activities(img_embed, 3)

    # kwargs = {
    #     "top_p": 0.9,
    #     "temperature": 1.2,
    #     "max_new_tokens": 20,
    #     "return_full_text": False,
    # }
    activities_raw = llm(zs)
    act_text = activities_raw[0]["generated_text"].lower()
    activities_clean = clean_response(act_text)

    log = (
        f"{frame}:"
        f"Places: {', '.join(places)}. "
        f"Objects: {', '.join(objects)}. "
        f"Activities: {', '.join(activities_clean)}"
    )
    # log = f'{zs} {", ".join(activities_clean)}'
    return log


def generate_log(log_path, images_path, vlm, llm):
    vs = VectorSearch()
    fi = FaissIndex(768, f"{images_path}/video.index")
    fi.reset()
    with open(log_path, "w") as f:

        for image in tqdm(sorted(glob.glob(f"{images_path}/*.jpg"))):
            # YouTube ids can contain underscores themselves, so take only the
            # last two underscore-separated fields (timestamp and frame number)
            *_, timestamp, frame = Path(image).stem.split("_")
            try:
                log = log_activity_from_image(image, frame, vlm, llm, vs, fi)
                print(log)
                f.write(f"{frame}:{log}\n")
            except Exception as e:
                print(e)
                continue
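Each line that generate_log writes to history.txt therefore has the following shape (the values below are illustrative):

    300:300:Places: office, home office, living room. Objects: computer, desk, monitor. Activities: working, typing, reading

The frame id appears twice because log_activity_from_image already prefixes it and generate_log adds it again; Summary.parse_history in tasks.py splits on ":" and "." and relies on exactly this layout, so the two prefixes should not be changed independently.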
models.py
ADDED
@@ -0,0 +1,111 @@
import json
from PIL import Image

import requests
from transformers import CLIPProcessor, CLIPModel

from embeddings import logger

with open("hf_api.key") as f:
    HF_TOKEN = f.read().strip()


class HuggingFaceHosted:
    def __init__(self, model_id, api_token, verbose=False):
        self.model_id = model_id
        self.api_token = api_token
        self.verbose = verbose

    def query(self, data):
        headers = {"Authorization": f"Bearer {self.api_token}"}
        API_URL = f"https://api-inference.huggingface.co/models/{self.model_id}"
        response = requests.request("POST", API_URL, headers=headers, data=data)
        return json.loads(response.content.decode("utf-8"))

    def fill_mask(self, text):
        data = json.dumps({"inputs": text})
        return self.query(data)

    def text_generation(self, text, **parameters):
        payload = {
            "inputs": text,
            "parameters": parameters,
        }
        if self.verbose:
            logger.info(payload)
        data = json.dumps(payload)
        return self.query(data)

    def summarization(self, text, do_sample=False):
        data = json.dumps({"inputs": text, "parameters": {"do_sample": do_sample}})
        return self.query(data)

    def question_answering(self, question, context):
        data = json.dumps(
            {
                "inputs": {
                    "question": question,
                    "context": context,
                }
            }
        )
        return self.query(data)


class CLIP:
    def __init__(self, model_id="openai/clip-vit-large-patch14"):
        self.model_id = model_id
        self.model = CLIPModel.from_pretrained(model_id)
        self.processor = CLIPProcessor.from_pretrained(model_id)

    def get_image_emb(self, image):
        if isinstance(image, str):
            image = Image.open(image)
        image_inputs = self.processor(images=image, return_tensors="pt", padding=True)
        out = self.model.get_image_features(**image_inputs)

        return out.detach().numpy()

    def get_text_emb(self, text):
        text_inputs = self.processor(text=text, return_tensors="pt", padding=True)
        out = self.model.get_text_features(**text_inputs)

        return out.detach().numpy()

    def __repr__(self):
        return f"CLIP Local <{self.model_id}>"


class GPTJ(HuggingFaceHosted):
    def __init__(
        self, model_id="EleutherAI/gpt-j-6B", api_token=HF_TOKEN, verbose=False
    ):
        super().__init__(model_id, api_token, verbose=verbose)

    def __call__(self, text, **parameters):
        return self.text_generation(text, **parameters)

    def __repr__(self):
        return f"GPTJ Hosted <{self.model_id}>"


class MaskEncoder(HuggingFaceHosted):
    def __init__(self, model_id="roberta-large", api_token=HF_TOKEN, verbose=False):
        super().__init__(model_id, api_token, verbose=verbose)

    def __call__(self, text):
        return self.fill_mask(text)

    def __repr__(self):
        return f"MaskEncoder Hosted <{self.model_id}>"


class T2T(HuggingFaceHosted):
    def __init__(self, model_id="bigscience/T0pp", api_token=HF_TOKEN, verbose=False):
        super().__init__(model_id, api_token, verbose=verbose)

    def __call__(self, text, **parameters):
        return self.text_generation(text, **parameters)

    def __repr__(self):
        return f"T2T Hosted <{self.model_id}>"
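A quick sanity check of the hosted text-to-text wrapper might look like this (it assumes hf_api.key holds a valid Hugging Face Inference API token; the prompt text is illustrative):

    from models import T2T

    llm = T2T()
    out = llm(
        "Places: kitchen. Objects: coffee maker. I might be doing these 3 activities:",
        max_new_tokens=20,
    )
    print(out[0]["generated_text"])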
models.yml
ADDED
@@ -0,0 +1 @@
HF_API_KEY: hf_api.key
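Note that models.py does not read this file; it opens a local file named hf_api.key (the value given here) at import time, so that token file has to be created by hand and is not included in this commit.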
reference_embeddings.py
ADDED
@@ -0,0 +1,44 @@
import argparse
from tqdm import tqdm

import faiss

from embeddings import FaissIndex
from models import CLIP


def main(file, index_type):

    clip = CLIP()
    with open(file) as f:
        references = f.read().split("\n")

    index = FaissIndex(
        embedding_size=768,
        faiss_index_location=f"faiss_indices/{index_type}.index",
        indexer=faiss.IndexFlatIP,
    )
    index.reset()

    if len(references) < 500:
        # CLIP.get_text_emb already returns a numpy array, so add it directly
        ref_embeddings = clip.get_text_emb(references)
        index.add(ref_embeddings, references)
    else:

        batches = list(range(0, len(references), 300)) + [len(references)]
        batched_objects = []
        for idx in range(0, len(batches) - 1):
            batched_objects.append(references[batches[idx] : batches[idx + 1]])

        for batch in tqdm(batched_objects):
            ref_embeddings = clip.get_text_emb(batch)
            index.add(ref_embeddings, batch)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("file", type=str, help="File containing references")
    parser.add_argument("index_type", type=str, choices=["places", "objects"])
    args = parser.parse_args()

    main(args.file, args.index_type)
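Example invocations (the text files are placeholders for newline-separated reference lists, one place or object per line):

    python reference_embeddings.py places.txt places
    python reference_embeddings.py objects.txt objects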
requirements.txt
ADDED
@@ -0,0 +1,8 @@
transformers
faiss-cpu
yt_dlp
nltk
opencv-python-headless
torch
tqdm
streamlit
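models.py and tasks.py also rely on requests and PIL (Pillow); in practice these arrive transitively (transformers pulls in requests, streamlit pulls in Pillow), so they are not listed separately here.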
tasks.py
ADDED
@@ -0,0 +1,93 @@
import glob
from collections import namedtuple
from PIL import Image

from embeddings import FaissIndex, VectorSearch


class Summary:
    def __init__(self, video_dir, llm):
        self.video_dir = video_dir
        self.llm = llm
        self.vs = VectorSearch()

    def flatten_list(self, s):
        if s == []:
            return s
        if isinstance(s[0], list):
            return self.flatten_list(s[0]) + self.flatten_list(s[1:])
        return s[:1] + self.flatten_list(s[1:])

    def parse_history(self):
        history = []
        with open(f"{self.video_dir}/history.txt") as f:
            for line in f:
                history.append(line.strip())

        history_proc = []
        proc = lambda x: list(map(str.strip, x.strip().split(",")))

        Record = namedtuple("Record", "frame places objects activities".split(" "))
        for hist in history:
            hist_list = hist.split(":")
            flat = self.flatten_list([x.split(".") for x in hist_list])
            frame = flat[0]

            places = proc(flat[3])
            objects = proc(flat[5])
            activities = proc(flat[-1])
            history_proc.append(Record(*[frame, places, objects, activities]))

        return history_proc

    def create_prompts(self, history_proc):
        split_idx = [i for i in range(len(history_proc)) if i % 5 == 0] + [
            len(history_proc)
        ]
        range_idx = [(split_idx[x - 1], split_idx[x]) for x in range(1, len(split_idx))]
        prompts = []
        for r in range_idx:
            prompts.append(self.vs.prompt_summary(history_proc[r[0] : r[1]]))

        return prompts

    def call_model(self, prompts):
        results = []
        for prompt in prompts:
            results.append(self.llm(prompt)[0]["generated_text"])

        return zip(prompts, results)

    def generate_summaries(self):
        history_proc = self.parse_history()
        prompts = self.create_prompts(history_proc)
        results = self.call_model(prompts)
        return results


class VideoSearch:
    def __init__(self, video_dir, vlm, llm=None):
        self.video_dir = video_dir
        self.fi = FaissIndex(faiss_index_location=f"{self.video_dir}/video.index")
        self.vlm = vlm
        self.llm = llm

    def find_nearest_frames(self, query):
        test = self.vlm.get_text_emb(query)
        D, I, frames = self.fi.search(test)
        return D, frames

    def get_images(self, frames, k=5):
        images = []
        for frame in frames[:k]:
            loc = glob.glob(f"{self.video_dir}/*_{frame}.jpg")[0]
            images.append(Image.open(loc))

        return images

    def search_engine(self, query):

        D, frames = self.find_nearest_frames(query)
        images = self.get_images(frames)

        return images
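Outside of Streamlit, the search task can be exercised directly once a video has been processed (the directory name below matches the demo video in app.py and assumes generate_log has already produced video.index and the frame images):

    from models import CLIP
    from tasks import VideoSearch

    search = VideoSearch("tQG6jYy9xto", CLIP())
    for img in search.search_engine("working at my computer"):
        img.show()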