spencer committed on
Commit
6df828c
1 Parent(s): c6527ad

add normal files

Files changed (8)
  1. app.py +78 -0
  2. embeddings.py +143 -0
  3. log_generation.py +116 -0
  4. models.py +111 -0
  5. models.yml +1 -0
  6. reference_embeddings.py +44 -0
  7. requirements.txt +8 -0
  8. tasks.py +93 -0
app.py ADDED
@@ -0,0 +1,78 @@
+ import os
+
+ import streamlit as st
+
+ from models import CLIP, T2T
+ from tasks import Summary, VideoSearch
+ from log_generation import download_youtube, extract_video_frames, generate_log
+
+
+ st.set_page_config(page_title="Socratic Models Demo", page_icon="", layout="wide")
+ st.title("Socratic Models Demo")
+
+ if "vlm" not in st.session_state:
+     st.session_state.vlm = CLIP()
+
+ if "llm" not in st.session_state:
+     st.session_state.llm = T2T()
+
+
+ col1, col2, _ = st.columns([2, 2, 3])
+ with col1:
+     url = st.text_input(
+         "YouTube Video URL", "https://www.youtube.com/watch?v=tQG6jYy9xto"
+     )
+     video_id = url.split("watch?v=")[-1]
+
+ with col2:
+     st.video(url)
+
+ if not os.path.exists(f"{video_id}"):
+     st.write("Video not found locally. Downloading may take several minutes. Continue?")
+
+     click = st.button("Download")
+     if not click:
+         st.stop()
+
+     st.success("Downloading...")
+     download_youtube(url)
+     st.write("Extracting frames...")
+     extract_video_frames(
+         f"{video_id}/{video_id}.mp4", dims=(600, 400), sampling_rate=100
+     )
+     st.write("Generating log...")
+     generate_log(
+         f"{video_id}/history.txt",
+         f"{video_id}",
+         st.session_state.vlm,
+         st.session_state.llm,
+     )
+     refresh = st.button("Click to refresh")
+     if not refresh:
+         st.stop()
+
+
+ search = VideoSearch(video_id, st.session_state.vlm)
+
+ st.title("Video Search")
+ query = st.text_input("Search Query", "working at my computer")
+ images = search.search_engine(query)
+ with st.expander(label="See results"):
+     for image in images:
+         st.image(image)
+
+
+ st.title("Event Summaries")
+ summ = Summary(video_id, st.session_state.llm)
+ summaries = summ.generate_summaries()
+ with st.expander(label="See results"):
+     for (prompt, result) in summaries:
+         st.markdown("*Event Log*")
+         st.write(prompt)
+         st.markdown("*Summary*")
+         st.write(result)
+
+
+ st.title("Video Event Log")
+ with open(f"{video_id}/history.txt", "r") as f:
+     st.text(f.read())
embeddings.py ADDED
@@ -0,0 +1,143 @@
+ import logging
+ import os
+
+ import faiss
+ import torch
+
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.INFO)
+
+
+ class FaissIndex:
+     def __init__(
+         self,
+         embedding_size=None,
+         faiss_index_location=None,
+         indexer=faiss.IndexFlatIP,
+     ):
+
+         if embedding_size or faiss_index_location:
+             self.embedding_size = embedding_size
+         else:
+             raise ValueError("Must provide embedding_size or faiss_index_location")
+
+         self.faiss_index_location = faiss_index_location
+         if faiss_index_location and os.path.exists(faiss_index_location):
+             self.index = faiss.read_index(faiss_index_location)
+             logger.info(f"Setting embedding size ({self.index.d}) to match saved index")
+             self.embedding_size = self.index.d
+             if os.path.exists(faiss_index_location + ".ids"):
+                 with open(faiss_index_location + ".ids") as f:
+                     # splitlines() avoids picking up a trailing empty id from the final newline
+                     self.id_list = f.read().splitlines()
+             elif self.index.ntotal > 0:
+                 raise ValueError("Index file exists but ids file does not")
+             else:
+                 self.id_list = []
+
+         else:
+             if faiss_index_location:
+                 os.makedirs(os.path.dirname(faiss_index_location), exist_ok=True)
+             self.index = None
+             self.indexer = indexer
+             self.id_list = []
+
+     def faiss_init(self):
+         index = self.indexer(self.embedding_size)
+         if self.faiss_index_location:
+             faiss.write_index(index, self.faiss_index_location)
+         self.index = index
+
+     def add(self, inputs, ids, normalize=True):
+         if not self.index:
+             self.faiss_init()
+
+         if normalize:
+             faiss.normalize_L2(inputs)
+         self.index.add(inputs)
+         self.id_list.extend(ids)
+
+         if self.faiss_index_location:
+             faiss.write_index(self.index, self.faiss_index_location)
+             with open(self.faiss_index_location + ".ids", "a") as f:
+                 f.write("\n".join(ids) + "\n")
+
+     def search(self, embedding, k=10, normalize=True):
+         # reshape a single vector to the (1, d) batch shape faiss expects
+         if len(embedding.shape) == 1:
+             embedding = embedding.reshape(1, -1)
+         if normalize:
+             faiss.normalize_L2(embedding)
+         D, I = self.index.search(embedding, k)
+         labels = [self.id_list[i] for i in I.squeeze()]
+         return D, I, labels
+
+     def reset(self):
+         if self.index:
+             self.index.reset()
+         self.id_list = []
+         try:
+             os.remove(self.faiss_index_location)
+             os.remove(self.faiss_index_location + ".ids")
+         except FileNotFoundError:
+             pass
+
+     def __len__(self):
+         if self.index:
+             return self.index.ntotal
+         return 0
+
+
+ class VectorSearch:
+     def __init__(self):
+         self.places = self.load("places")
+         self.objects = self.load("objects")
+
+     def load(self, index_name):
+         return FaissIndex(
+             faiss_index_location=f"faiss_indices/{index_name}.index",
+         )
+
+     def top_places(self, query_vec, k=5):
+         if isinstance(query_vec, torch.Tensor):
+             query_vec = query_vec.detach().numpy()
+         *_, results = self.places.search(query_vec, k=k)
+         return results
+
+     def top_objects(self, query_vec, k=5):
+         if isinstance(query_vec, torch.Tensor):
+             query_vec = query_vec.detach().numpy()
+         *_, results = self.objects.search(query_vec, k=k)
+         return results
+
+     def prompt_activities(self, query_vec, k=5, one_shot=False):
+         places = self.top_places(query_vec, k=k)
+         objects = self.top_objects(query_vec, k=k)
+         place_str = f"Places: {', '.join(places)}. "
+         object_str = f"Objects: {', '.join(objects)}. "
+
+         act_str = "I might be doing these 3 activities: "
+
+         # zero-shot prompt: observed places and objects, then the activity stem
+         zs = place_str + object_str + act_str
+
+         # one-shot prompt: the same stem preceded by a worked example
+         example = (
+             "Places: kitchen. Objects: coffee maker. "
+             f"{act_str}: eating, making breakfast, grinding coffee.\n "
+         )
+         fs = example + place_str + object_str + act_str
+         if one_shot:
+             return (zs, fs)
+
+         return zs, places, objects
+
+     def prompt_summary(self, state_history: list, k=5):
+         rec_strings = ["Event log:"]
+         for rec in state_history:
+             rec_strings.append(
+                 f"Places: {', '.join(rec.places)}. "
+                 f"Objects: {', '.join(rec.objects)}. "
+                 f"Activities: {', '.join(rec.activities)} "
+             )
+         question = "How would you summarize these events in a few full sentences? "
+         return "\n".join(rec_strings) + "\n" + question
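FaissIndex above is the thin persistence layer used by both the reference vocabularies and the per-video frame index. A minimal round-trip sketch, not part of the commit; "demo_indices/demo.index" is a hypothetical path and the vectors are random:

import numpy as np

from embeddings import FaissIndex

index = FaissIndex(embedding_size=768, faiss_index_location="demo_indices/demo.index")
index.reset()

vecs = np.random.rand(10, 768).astype("float32")  # faiss expects float32; add() L2-normalizes in place
ids = [f"item_{i}" for i in range(10)]
index.add(vecs, ids)  # writes demo.index and demo.index.ids

D, I, labels = index.search(vecs[0], k=3)
print(labels)  # nearest stored ids; "item_0" should come back first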
log_generation.py ADDED
@@ -0,0 +1,116 @@
+ import glob
+ import string
+ from datetime import datetime
+ from pathlib import Path
+
+ import cv2
+ import yt_dlp
+ from nltk.tokenize import sent_tokenize
+ from tqdm import tqdm
+
+ from embeddings import VectorSearch, FaissIndex
+
+
+ def download_youtube(url, parent_dir="."):
+     def extract_youtube_id(url):
+         return url.split("watch?v=")[-1]
+
+     video_path = extract_youtube_id(url)
+     ydl_opts = {
+         "format": "mp4",
+         "outtmpl": f"{parent_dir}/{video_path}/{video_path}.%(ext)s",
+     }
+
+     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+         error_code = ydl.download([url])
+
+     return error_code
+
+
+ def extract_video_frames(video_path, dims=(600, 400), sampling_rate=100):
+     video_dir = str(Path(video_path).parent)
+     video_name = str(Path(video_path).stem)
+     cap = cv2.VideoCapture(video_path)
+
+     i = 0
+     while cap.isOpened():
+         ret, frame = cap.read()
+
+         if not ret:
+             break
+
+         # keep one frame every `sampling_rate` frames
+         if i % sampling_rate == 0:
+             print(i)
+
+             frame = cv2.resize(frame, dims, fx=0, fy=0, interpolation=cv2.INTER_CUBIC)
+             timestamp = datetime.utcnow().timestamp()
+             cv2.imwrite(f"{video_dir}/{video_name}_{timestamp}_{i}.jpg", frame)
+
+         i += 1
+
+     cap.release()
+     cv2.destroyAllWindows()
+
+
+ def strip_punctuation(text):
+     return text.translate(str.maketrans("", "", string.punctuation))
+
+
+ def clean_response(act_text):
+     act_text = act_text.lower().replace("\n", "")
+     text_split = act_text.split("places")[0]
+     if not text_split:
+         text_split = act_text
+
+     try:
+         first_sent = sent_tokenize(text_split)[0]
+     except Exception:
+         first_sent = text_split
+
+     list_split = first_sent.split(",")
+     no_spaces = list(map(str.strip, list_split))
+
+     return list(map(strip_punctuation, no_spaces))[:3]
+
+
+ def log_activity_from_image(image_file, frame, vlm, llm, vs, fi):
+     img_embed = vlm.get_image_emb(image_file)
+     fi.add(img_embed, [frame])
+     zs, places, objects = vs.prompt_activities(img_embed, 3)
+
+     # kwargs = {
+     #     "top_p": 0.9,
+     #     "temperature": 1.2,
+     #     "max_new_tokens": 20,
+     #     "return_full_text": False,
+     # }
+     activities_raw = llm(zs)
+     act_text = activities_raw[0]["generated_text"].lower()
+     activities_clean = clean_response(act_text)
+
+     log = (
+         f"{frame}:"
+         f"Places: {', '.join(places)}. "
+         f"Objects: {', '.join(objects)}. "
+         f"Activities: {', '.join(activities_clean)}"
+     )
+     # log = f'{zs} {", ".join(activities_clean)}'
+     return log
+
+
+ def generate_log(log_path, images_path, vlm, llm):
+     vs = VectorSearch()
+     fi = FaissIndex(768, f"{images_path}/video.index")
+     fi.reset()
+     with open(log_path, "w") as f:
+         for image in tqdm(sorted(glob.glob(f"{images_path}/*.jpg"))):
+             video_name, timestamp, frame = Path(image).stem.split("_")
+             try:
+                 log = log_activity_from_image(image, frame, vlm, llm, vs, fi)
+                 print(log)
+                 # `log` already starts with the frame id, so each line comes out as
+                 # "<frame>:<frame>:Places: ...", which tasks.Summary.parse_history relies on
+                 f.write(f"{frame}:{log}\n")
+             except Exception as e:
+                 print(e)
+                 continue
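For reference, this is the line format generate_log writes and how tasks.Summary.parse_history slices it back apart. The sketch is not part of the commit and the field values are made up:

# one history.txt line, as produced by generate_log (note the doubled frame id)
line = "100:100:Places: office, desk. Objects: laptop, mug. Activities: typing, reading, working"

parts = [seg.split(".") for seg in line.split(":")]
flat = [item for seg in parts for item in seg]  # matches Summary.flatten_list for this one-level nesting
frame = flat[0]                                    # "100"
places = [s.strip() for s in flat[3].split(",")]   # ["office", "desk"]
objects = [s.strip() for s in flat[5].split(",")]  # ["laptop", "mug"]
activities = [s.strip() for s in flat[-1].split(",")]
print(frame, places, objects, activities)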
models.py ADDED
@@ -0,0 +1,111 @@
+ import json
+ from PIL import Image
+
+ import requests
+ from transformers import CLIPProcessor, CLIPModel
+
+ from embeddings import logger
+
+ with open("hf_api.key") as f:
+     HF_TOKEN = f.read().strip()
+
+
+ class HuggingFaceHosted:
+     def __init__(self, model_id, api_token, verbose=False):
+         self.model_id = model_id
+         self.api_token = api_token
+         self.verbose = verbose
+
+     def query(self, data):
+         headers = {"Authorization": f"Bearer {self.api_token}"}
+         API_URL = f"https://api-inference.huggingface.co/models/{self.model_id}"
+         response = requests.request("POST", API_URL, headers=headers, data=data)
+         return json.loads(response.content.decode("utf-8"))
+
+     def fill_mask(self, text):
+         data = json.dumps({"inputs": text})
+         return self.query(data)
+
+     def text_generation(self, text, **parameters):
+         payload = {
+             "inputs": text,
+             "parameters": parameters,
+         }
+         if self.verbose:
+             logger.info(payload)
+         data = json.dumps(payload)
+         return self.query(data)
+
+     def summarization(self, text, do_sample=False):
+         data = json.dumps({"inputs": text, "parameters": {"do_sample": do_sample}})
+         return self.query(data)
+
+     def question_answering(self, question, context):
+         data = json.dumps(
+             {
+                 "inputs": {
+                     "question": question,
+                     "context": context,
+                 }
+             }
+         )
+         return self.query(data)
+
+
+ class CLIP:
+     def __init__(self, model_id="openai/clip-vit-large-patch14"):
+         self.model_id = model_id
+         self.model = CLIPModel.from_pretrained(model_id)
+         self.processor = CLIPProcessor.from_pretrained(model_id)
+
+     def get_image_emb(self, image):
+         if isinstance(image, str):
+             image = Image.open(image)
+         image_inputs = self.processor(images=image, return_tensors="pt", padding=True)
+         out = self.model.get_image_features(**image_inputs)
+
+         return out.detach().numpy()
+
+     def get_text_emb(self, text):
+         text_inputs = self.processor(text=text, return_tensors="pt", padding=True)
+         out = self.model.get_text_features(**text_inputs)
+
+         return out.detach().numpy()
+
+     def __repr__(self):
+         return f"CLIP Local <{self.model_id}>"
+
+
+ class GPTJ(HuggingFaceHosted):
+     def __init__(
+         self, model_id="EleutherAI/gpt-j-6B", api_token=HF_TOKEN, verbose=False
+     ):
+         super().__init__(model_id, api_token, verbose=verbose)
+
+     def __call__(self, text, **parameters):
+         return self.text_generation(text, **parameters)
+
+     def __repr__(self):
+         return f"GPTJ Hosted <{self.model_id}>"
+
+
+ class MaskEncoder(HuggingFaceHosted):
+     def __init__(self, model_id="roberta-large", api_token=HF_TOKEN, verbose=False):
+         super().__init__(model_id, api_token, verbose=verbose)
+
+     def __call__(self, text):
+         return self.fill_mask(text)
+
+     def __repr__(self):
+         return f"MaskEncoder Hosted <{self.model_id}>"
+
+
+ class T2T(HuggingFaceHosted):
+     def __init__(self, model_id="bigscience/T0pp", api_token=HF_TOKEN, verbose=False):
+         super().__init__(model_id, api_token, verbose=verbose)
+
+     def __call__(self, text, **parameters):
+         return self.text_generation(text, **parameters)
+
+     def __repr__(self):
+         return f"T2T Hosted <{self.model_id}>"
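The wrappers above split the workload between a local CLIP and hosted text generation. A short usage sketch, not part of the commit; it assumes hf_api.key exists, and the generation parameters simply mirror the commented-out kwargs in log_generation.py:

from models import CLIP, T2T

vlm = CLIP()  # runs locally via transformers
llm = T2T()   # POSTs to the hosted Inference API using hf_api.key

text_emb = vlm.get_text_emb(["working at my computer"])  # (1, 768) numpy array

prompt = "Places: office. Objects: laptop. I might be doing these 3 activities: "
out = llm(prompt, max_new_tokens=20, top_p=0.9, return_full_text=False)
print(out[0]["generated_text"])  # the rest of the code indexes hosted responses this way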
models.yml ADDED
@@ -0,0 +1 @@
+ HF_API_KEY: hf_api.key
reference_embeddings.py ADDED
@@ -0,0 +1,44 @@
+ import argparse
+ from tqdm import tqdm
+
+ import faiss
+
+ from embeddings import FaissIndex
+ from models import CLIP
+
+
+ def main(file, index_type):
+     clip = CLIP()
+     with open(file) as f:
+         references = f.read().split("\n")
+
+     index = FaissIndex(
+         embedding_size=768,
+         faiss_index_location=f"faiss_indices/{index_type}.index",
+         indexer=faiss.IndexFlatIP,
+     )
+     index.reset()
+
+     # CLIP.get_text_emb already returns a numpy array, so no .detach() is needed here
+     if len(references) < 500:
+         ref_embeddings = clip.get_text_emb(references)
+         index.add(ref_embeddings, references)
+     else:
+         # embed in batches of 300 to keep memory bounded
+         batches = list(range(0, len(references), 300)) + [len(references)]
+         batched_objects = []
+         for idx in range(0, len(batches) - 1):
+             batched_objects.append(references[batches[idx] : batches[idx + 1]])
+
+         for batch in tqdm(batched_objects):
+             ref_embeddings = clip.get_text_emb(batch)
+             index.add(ref_embeddings, batch)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("file", type=str, help="File containing references")
+     parser.add_argument("index_type", type=str, choices=["places", "objects"])
+     args = parser.parse_args()
+
+     main(args.file, args.index_type)
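This script has to be run once per vocabulary before the app can prompt for activities, because VectorSearch() in embeddings.py loads faiss_indices/places.index and faiss_indices/objects.index. A sketch of doing that programmatically, not part of the commit; "places.txt" and "objects.txt" are hypothetical newline-delimited vocabulary files:

from reference_embeddings import main

main("places.txt", "places")    # writes faiss_indices/places.index (+ .ids)
main("objects.txt", "objects")  # writes faiss_indices/objects.index (+ .ids)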
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ transformers
+ faiss-cpu
+ yt_dlp
+ nltk
+ opencv-python-headless
+ torch
+ tqdm
+ streamlit
+ Pillow
tasks.py ADDED
@@ -0,0 +1,93 @@
+ import glob
+ from collections import namedtuple
+ from PIL import Image
+
+ from embeddings import FaissIndex, VectorSearch
+
+
+ class Summary:
+     def __init__(self, video_dir, llm):
+         self.video_dir = video_dir
+         self.llm = llm
+         self.vs = VectorSearch()
+
+     def flatten_list(self, s):
+         if s == []:
+             return s
+         if isinstance(s[0], list):
+             return self.flatten_list(s[0]) + self.flatten_list(s[1:])
+         return s[:1] + self.flatten_list(s[1:])
+
+     def parse_history(self):
+         history = []
+         with open(f"{self.video_dir}/history.txt") as f:
+             for line in f:
+                 history.append(line.strip())
+
+         history_proc = []
+         proc = lambda x: list(map(str.strip, x.strip().split(",")))
+
+         Record = namedtuple("Record", "frame places objects activities")
+         for hist in history:
+             # each line looks like "<frame>:<frame>:Places: ... Objects: ... Activities: ..."
+             hist_list = hist.split(":")
+             flat = self.flatten_list([x.split(".") for x in hist_list])
+             frame = flat[0]
+
+             places = proc(flat[3])
+             objects = proc(flat[5])
+             activities = proc(flat[-1])
+             history_proc.append(Record(frame, places, objects, activities))
+
+         return history_proc
+
+     def create_prompts(self, history_proc):
+         # group the parsed records into windows of 5 and build one summary prompt per window
+         split_idx = [i for i in range(len(history_proc)) if i % 5 == 0] + [
+             len(history_proc)
+         ]
+         range_idx = [(split_idx[x - 1], split_idx[x]) for x in range(1, len(split_idx))]
+         prompts = []
+         for r in range_idx:
+             prompts.append(self.vs.prompt_summary(history_proc[r[0] : r[1]]))
+
+         return prompts
+
+     def call_model(self, prompts):
+         results = []
+         for prompt in prompts:
+             results.append(self.llm(prompt)[0]["generated_text"])
+
+         return zip(prompts, results)
+
+     def generate_summaries(self):
+         history_proc = self.parse_history()
+         prompts = self.create_prompts(history_proc)
+         results = self.call_model(prompts)
+         return results
+
+
+ class VideoSearch:
+     def __init__(self, video_dir, vlm, llm=None):
+         self.video_dir = video_dir
+         self.fi = FaissIndex(faiss_index_location=f"{self.video_dir}/video.index")
+         self.vlm = vlm
+         self.llm = llm
+
+     def find_nearest_frames(self, query):
+         test = self.vlm.get_text_emb(query)
+         D, I, frames = self.fi.search(test)
+         return D, frames
+
+     def get_images(self, frames, k=5):
+         images = []
+         for frame in frames[:k]:
+             loc = glob.glob(f"{self.video_dir}/*_{frame}.jpg")[0]
+             images.append(Image.open(loc))
+
+         return images
+
+     def search_engine(self, query):
+         D, frames = self.find_nearest_frames(query)
+         images = self.get_images(frames)
+
+         return images
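Both tasks can also be driven outside Streamlit once a video directory has been populated. A sketch, not part of the commit; it assumes hf_api.key, the reference indices, and the per-video history.txt and video.index already exist, and it reuses the demo video id from app.py:

from models import CLIP, T2T
from tasks import Summary, VideoSearch

video_id = "tQG6jYy9xto"

search = VideoSearch(video_id, CLIP())
for img in search.search_engine("working at my computer"):
    img.show()  # PIL images of the nearest sampled frames

for prompt, summary in Summary(video_id, T2T()).generate_summaries():
    print(prompt, "->", summary)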