add normal files
- app.py +78 -0
- embeddings.py +143 -0
- log_generation.py +116 -0
- models.py +111 -0
- models.yml +1 -0
- reference_embeddings.py +44 -0
- requirements.txt +8 -0
- tasks.py +93 -0
app.py
ADDED
@@ -0,0 +1,78 @@
import os

import streamlit as st

from models import CLIP, T2T
from tasks import Summary, VideoSearch
from log_generation import download_youtube, extract_video_frames, generate_log


st.set_page_config(page_title="Socratic Models Demo", page_icon="", layout="wide")
st.title("Socratic Models Demo")

if "vlm" not in st.session_state:
    st.session_state.vlm = CLIP()

if "llm" not in st.session_state:
    st.session_state.llm = T2T()


col1, col2, _ = st.columns([2, 2, 3])
with col1:
    url = st.text_input(
        "YouTube Video URL", "https://www.youtube.com/watch?v=tQG6jYy9xto"
    )
    video_id = url.split("watch?v=")[-1]

with col2:
    st.video(url)

if not os.path.exists(f"{video_id}"):
    st.write("Video not found locally. Downloading may take several minutes. Continue?")

    click = st.button("Download")
    if not click:
        st.stop()

    st.success("Downloading...")
    download_youtube(url)
    st.write("Extracting frames...")
    extract_video_frames(
        f"{video_id}/{video_id}.mp4", dims=(600, 400), sampling_rate=100
    )
    st.write("Generating log...")
    generate_log(
        f"{video_id}/history.txt",
        f"{video_id}",
        st.session_state.vlm,
        st.session_state.llm,
    )
    refresh = st.button("Click to refresh")
    if not refresh:
        st.stop()


search = VideoSearch(video_id, st.session_state.vlm)

st.title("Video Search")
query = st.text_input("Search Query", "working at my computer")
images = search.search_engine(query)
with st.expander(label="See results"):
    for image in images:
        st.image(image)


st.title("Event Summaries")
summ = Summary(video_id, st.session_state.llm)
summaries = summ.generate_summaries()
with st.expander(label="See results"):
    for (prompt, result) in summaries:
        st.markdown("*Event Log*")
        st.write(prompt)
        st.markdown("*Summary*")
        st.write(result)


st.title("Video Event Log")
with open(f"{video_id}/history.txt", "r") as f:
    st.text(f.read())
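The app is a standard Streamlit script, so it should start with the usual entry point (assuming the packages in requirements.txt are installed and the faiss_indices/ directory has been built with reference_embeddings.py, since VectorSearch loads it):

    streamlit run app.py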
embeddings.py
ADDED
@@ -0,0 +1,143 @@
import logging
import os

import faiss
import torch

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class FaissIndex:
    def __init__(
        self,
        embedding_size=None,
        faiss_index_location=None,
        indexer=faiss.IndexFlatIP,
    ):

        if embedding_size or faiss_index_location:
            self.embedding_size = embedding_size
        else:
            raise ValueError("Must provide embedding_size or faiss_index_location")

        self.faiss_index_location = faiss_index_location
        if faiss_index_location and os.path.exists(faiss_index_location):
            self.index = faiss.read_index(faiss_index_location)
            logger.info(f"Setting embedding size ({self.index.d}) to match saved index")
            self.embedding_size = self.index.d
            if os.path.exists(faiss_index_location + ".ids"):
                with open(faiss_index_location + ".ids") as f:
                    self.id_list = f.read().split("\n")
            elif self.index.ntotal > 0:
                raise ValueError("Index file exists but ids file does not")
            else:
                self.id_list = []

        else:
            if faiss_index_location:
                os.makedirs(os.path.dirname(faiss_index_location), exist_ok=True)
            self.index = None
            self.indexer = indexer
            self.id_list = []

    def faiss_init(self):

        index = self.indexer(self.embedding_size)
        if self.faiss_index_location:
            faiss.write_index(index, self.faiss_index_location)
        self.index = index

    def add(self, inputs, ids, normalize=True):

        if not self.index:
            self.faiss_init()

        if normalize:
            faiss.normalize_L2(inputs)
        self.index.add(inputs)
        self.id_list.extend(ids)

        # persist only when an on-disk location was configured
        if self.faiss_index_location:
            faiss.write_index(self.index, self.faiss_index_location)
            with open(self.faiss_index_location + ".ids", "a") as f:
                f.write("\n".join(ids) + "\n")

    def search(self, embedding, k=10, normalize=True):

        if len(embedding.shape) == 1:
            embedding = embedding.reshape(1, -1)
        if normalize:
            faiss.normalize_L2(embedding)
        D, I = self.index.search(embedding, k)
        labels = [self.id_list[i] for i in I.squeeze()]
        return D, I, labels

    def reset(self):

        if self.index:
            self.index.reset()
        self.id_list = []
        if self.faiss_index_location:
            try:
                os.remove(self.faiss_index_location)
                os.remove(self.faiss_index_location + ".ids")
            except FileNotFoundError:
                pass

    def __len__(self):
        if self.index:
            return self.index.ntotal
        return 0


class VectorSearch:
    def __init__(self):
        self.places = self.load("places")
        self.objects = self.load("objects")

    def load(self, index_name):
        return FaissIndex(
            faiss_index_location=f"faiss_indices/{index_name}.index",
        )

    def top_places(self, query_vec, k=5):
        if isinstance(query_vec, torch.Tensor):
            query_vec = query_vec.detach().numpy()
        *_, results = self.places.search(query_vec, k=k)
        return results

    def top_objects(self, query_vec, k=5):
        if isinstance(query_vec, torch.Tensor):
            query_vec = query_vec.detach().numpy()
        *_, results = self.objects.search(query_vec, k=k)
        return results

    def prompt_activities(self, query_vec, k=5, one_shot=False):
        places = self.top_places(query_vec, k=k)
        objects = self.top_objects(query_vec, k=k)
        place_str = f"Places: {', '.join(places)}. "
        object_str = f"Objects: {', '.join(objects)}. "

        act_str = "I might be doing these 3 activities: "

        zs = place_str + object_str + act_str

        example = (
            "Places: kitchen. Objects: coffee maker. "
            f"{act_str}eating, making breakfast, grinding coffee.\n "
        )
        fs = example + place_str + object_str + act_str
        if one_shot:
            return (zs, fs)

        return zs, places, objects

    def prompt_summary(self, state_history: list, k=5):

        rec_strings = ["Event log:"]
        for rec in state_history:
            rec_strings.append(
                f"Places: {', '.join(rec.places)}. "
                f"Objects: {', '.join(rec.objects)}. "
                f"Activities: {', '.join(rec.activities)} "
            )
        question = "How would you summarize these events in a few full sentences? "
        return "\n".join(rec_strings) + "\n" + question
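A minimal sketch of using FaissIndex on its own (the demo path and the random vectors below are illustrative, not part of this commit):

    import numpy as np
    from embeddings import FaissIndex

    index = FaissIndex(embedding_size=768, faiss_index_location="faiss_indices/demo.index")
    index.reset()
    vectors = np.random.rand(3, 768).astype("float32")  # FAISS expects float32
    index.add(vectors, ["frame_0", "frame_1", "frame_2"])
    D, I, labels = index.search(np.random.rand(768).astype("float32"), k=2)
    print(labels)  # ids of the two nearest stored vectors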
log_generation.py
ADDED
@@ -0,0 +1,116 @@
import glob
import string
from datetime import datetime
from pathlib import Path

import cv2
import yt_dlp
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

from embeddings import VectorSearch, FaissIndex


def download_youtube(url, parent_dir="."):
    def extract_youtube_id(url):
        return url.split("watch?v=")[-1]

    video_path = extract_youtube_id(url)
    ydl_opts = {
        "format": "mp4",
        "outtmpl": f"{parent_dir}/{video_path}/{video_path}.%(ext)s",
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        error_code = ydl.download([url])

    return error_code


def extract_video_frames(video_path, dims=(600, 400), sampling_rate=100):
    video_dir = str(Path(video_path).parent)
    video_name = str(Path(video_path).stem)
    cap = cv2.VideoCapture(video_path)

    i = 0
    while cap.isOpened():
        ret, frame = cap.read()

        if not ret:
            break

        if i % sampling_rate == 0:
            print(i)

            frame = cv2.resize(frame, dims, fx=0, fy=0, interpolation=cv2.INTER_CUBIC)
            timestamp = datetime.utcnow().timestamp()
            cv2.imwrite(f"{video_dir}/{video_name}_{timestamp}_{i}.jpg", frame)

        i += 1

    cap.release()
    cv2.destroyAllWindows()


def strip_punctuation(text):
    return text.translate(str.maketrans("", "", string.punctuation))


def clean_response(act_text):

    act_text = act_text.lower().replace("\n", "")
    text_split = act_text.split("places")[0]
    if not text_split:
        text_split = act_text

    try:
        first_sent = sent_tokenize(text_split)[0]
    except Exception:
        first_sent = text_split

    list_split = first_sent.split(",")
    no_spaces = list(map(str.strip, list_split))

    return list(map(strip_punctuation, no_spaces))[:3]


def log_activity_from_image(image_file, frame, vlm, llm, vs, fi):
    img_embed = vlm.get_image_emb(image_file)
    fi.add(img_embed, [frame])
    zs, places, objects = vs.prompt_activities(img_embed, 3)

    # kwargs = {
    #     "top_p": 0.9,
    #     "temperature": 1.2,
    #     "max_new_tokens": 20,
    #     "return_full_text": False,
    # }
    activities_raw = llm(zs)
    act_text = activities_raw[0]["generated_text"].lower()
    activities_clean = clean_response(act_text)

    log = (
        f"{frame}:"
        f"Places: {', '.join(places)}. "
        f"Objects: {', '.join(objects)}. "
        f"Activities: {', '.join(activities_clean)}"
    )
    # log = f'{zs} {", ".join(activities_clean)}'
    return log


def generate_log(log_path, images_path, vlm, llm):
    vs = VectorSearch()
    fi = FaissIndex(768, f"{images_path}/video.index")
    fi.reset()
    with open(log_path, "w") as f:

        for image in tqdm(sorted(glob.glob(f"{images_path}/*.jpg"))):
            # YouTube ids can contain underscores themselves, so take only the
            # last two underscore-separated fields (timestamp and frame number)
            *_, timestamp, frame = Path(image).stem.split("_")
            try:
                log = log_activity_from_image(image, frame, vlm, llm, vs, fi)
                print(log)
                f.write(f"{frame}:{log}\n")
            except Exception as e:
                print(e)
                continue
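Each line that generate_log writes to history.txt therefore has the following shape (the values below are illustrative):

    300:300:Places: office, home office, living room. Objects: computer, desk, monitor. Activities: working, typing, reading

The frame id appears twice because log_activity_from_image already prefixes it and generate_log adds it again; Summary.parse_history in tasks.py splits on ":" and "." and relies on exactly this layout, so the two prefixes should not be changed independently.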
models.py
ADDED
@@ -0,0 +1,111 @@
import json
from PIL import Image

import requests
from transformers import CLIPProcessor, CLIPModel

from embeddings import logger

with open("hf_api.key") as f:
    HF_TOKEN = f.read().strip()


class HuggingFaceHosted:
    def __init__(self, model_id, api_token, verbose=False):
        self.model_id = model_id
        self.api_token = api_token
        self.verbose = verbose

    def query(self, data):
        headers = {"Authorization": f"Bearer {self.api_token}"}
        API_URL = f"https://api-inference.huggingface.co/models/{self.model_id}"
        response = requests.request("POST", API_URL, headers=headers, data=data)
        return json.loads(response.content.decode("utf-8"))

    def fill_mask(self, text):
        data = json.dumps({"inputs": text})
        return self.query(data)

    def text_generation(self, text, **parameters):
        payload = {
            "inputs": text,
            "parameters": parameters,
        }
        if self.verbose:
            logger.info(payload)
        data = json.dumps(payload)
        return self.query(data)

    def summarization(self, text, do_sample=False):
        data = json.dumps({"inputs": text, "parameters": {"do_sample": do_sample}})
        return self.query(data)

    def question_answering(self, question, context):
        data = json.dumps(
            {
                "inputs": {
                    "question": question,
                    "context": context,
                }
            }
        )
        return self.query(data)


class CLIP:
    def __init__(self, model_id="openai/clip-vit-large-patch14"):
        self.model_id = model_id
        self.model = CLIPModel.from_pretrained(model_id)
        self.processor = CLIPProcessor.from_pretrained(model_id)

    def get_image_emb(self, image):
        if isinstance(image, str):
            image = Image.open(image)
        image_inputs = self.processor(images=image, return_tensors="pt", padding=True)
        out = self.model.get_image_features(**image_inputs)

        return out.detach().numpy()

    def get_text_emb(self, text):
        text_inputs = self.processor(text=text, return_tensors="pt", padding=True)
        out = self.model.get_text_features(**text_inputs)

        return out.detach().numpy()

    def __repr__(self):
        return f"CLIP Local <{self.model_id}>"


class GPTJ(HuggingFaceHosted):
    def __init__(
        self, model_id="EleutherAI/gpt-j-6B", api_token=HF_TOKEN, verbose=False
    ):
        super().__init__(model_id, api_token, verbose=verbose)

    def __call__(self, text, **parameters):
        return self.text_generation(text, **parameters)

    def __repr__(self):
        return f"GPTJ Hosted <{self.model_id}>"


class MaskEncoder(HuggingFaceHosted):
    def __init__(self, model_id="roberta-large", api_token=HF_TOKEN, verbose=False):
        super().__init__(model_id, api_token, verbose=verbose)

    def __call__(self, text):
        return self.fill_mask(text)

    def __repr__(self):
        return f"MaskEncoder Hosted <{self.model_id}>"


class T2T(HuggingFaceHosted):
    def __init__(self, model_id="bigscience/T0pp", api_token=HF_TOKEN, verbose=False):
        super().__init__(model_id, api_token, verbose=verbose)

    def __call__(self, text, **parameters):
        return self.text_generation(text, **parameters)

    def __repr__(self):
        return f"T2T Hosted <{self.model_id}>"
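A quick sanity check of the hosted text-to-text wrapper might look like this (it assumes hf_api.key holds a valid Hugging Face Inference API token; the prompt text is illustrative):

    from models import T2T

    llm = T2T()
    out = llm(
        "Places: kitchen. Objects: coffee maker. I might be doing these 3 activities:",
        max_new_tokens=20,
    )
    print(out[0]["generated_text"])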
models.yml
ADDED
@@ -0,0 +1 @@
HF_API_KEY: hf_api.key
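Note that models.py does not read this file; it opens a local file named hf_api.key (the value given here) at import time, so that token file has to be created by hand and is not included in this commit.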
reference_embeddings.py
ADDED
@@ -0,0 +1,44 @@
import argparse
from tqdm import tqdm

import faiss

from embeddings import FaissIndex
from models import CLIP


def main(file, index_type):

    clip = CLIP()
    with open(file) as f:
        references = f.read().split("\n")

    index = FaissIndex(
        embedding_size=768,
        faiss_index_location=f"faiss_indices/{index_type}.index",
        indexer=faiss.IndexFlatIP,
    )
    index.reset()

    if len(references) < 500:
        # CLIP.get_text_emb already returns a numpy array, so add it directly
        ref_embeddings = clip.get_text_emb(references)
        index.add(ref_embeddings, references)
    else:

        batches = list(range(0, len(references), 300)) + [len(references)]
        batched_objects = []
        for idx in range(0, len(batches) - 1):
            batched_objects.append(references[batches[idx] : batches[idx + 1]])

        for batch in tqdm(batched_objects):
            ref_embeddings = clip.get_text_emb(batch)
            index.add(ref_embeddings, batch)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("file", type=str, help="File containing references")
    parser.add_argument("index_type", type=str, choices=["places", "objects"])
    args = parser.parse_args()

    main(args.file, args.index_type)
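Example invocations (the text files are placeholders for newline-separated reference lists, one place or object per line):

    python reference_embeddings.py places.txt places
    python reference_embeddings.py objects.txt objects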
requirements.txt
ADDED
@@ -0,0 +1,8 @@
transformers
faiss-cpu
yt_dlp
nltk
opencv-python-headless
torch
tqdm
streamlit
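models.py and tasks.py also rely on requests and PIL (Pillow); in practice these arrive transitively (transformers pulls in requests, streamlit pulls in Pillow), so they are not listed separately here.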
tasks.py
ADDED
@@ -0,0 +1,93 @@
import glob
from collections import namedtuple
from PIL import Image

from embeddings import FaissIndex, VectorSearch


class Summary:
    def __init__(self, video_dir, llm):
        self.video_dir = video_dir
        self.llm = llm
        self.vs = VectorSearch()

    def flatten_list(self, s):
        if s == []:
            return s
        if isinstance(s[0], list):
            return self.flatten_list(s[0]) + self.flatten_list(s[1:])
        return s[:1] + self.flatten_list(s[1:])

    def parse_history(self):
        history = []
        with open(f"{self.video_dir}/history.txt") as f:
            for line in f:
                history.append(line.strip())

        history_proc = []
        proc = lambda x: list(map(str.strip, x.strip().split(",")))

        Record = namedtuple("Record", "frame places objects activities".split(" "))
        for hist in history:
            hist_list = hist.split(":")
            flat = self.flatten_list([x.split(".") for x in hist_list])
            frame = flat[0]

            places = proc(flat[3])
            objects = proc(flat[5])
            activities = proc(flat[-1])
            history_proc.append(Record(*[frame, places, objects, activities]))

        return history_proc

    def create_prompts(self, history_proc):
        split_idx = [i for i in range(len(history_proc)) if i % 5 == 0] + [
            len(history_proc)
        ]
        range_idx = [(split_idx[x - 1], split_idx[x]) for x in range(1, len(split_idx))]
        prompts = []
        for r in range_idx:
            prompts.append(self.vs.prompt_summary(history_proc[r[0] : r[1]]))

        return prompts

    def call_model(self, prompts):
        results = []
        for prompt in prompts:
            results.append(self.llm(prompt)[0]["generated_text"])

        return zip(prompts, results)

    def generate_summaries(self):
        history_proc = self.parse_history()
        prompts = self.create_prompts(history_proc)
        results = self.call_model(prompts)
        return results


class VideoSearch:
    def __init__(self, video_dir, vlm, llm=None):
        self.video_dir = video_dir
        self.fi = FaissIndex(faiss_index_location=f"{self.video_dir}/video.index")
        self.vlm = vlm
        self.llm = llm

    def find_nearest_frames(self, query):
        test = self.vlm.get_text_emb(query)
        D, I, frames = self.fi.search(test)
        return D, frames

    def get_images(self, frames, k=5):
        images = []
        for frame in frames[:k]:
            loc = glob.glob(f"{self.video_dir}/*_{frame}.jpg")[0]
            images.append(Image.open(loc))

        return images

    def search_engine(self, query):

        D, frames = self.find_nearest_frames(query)
        images = self.get_images(frames)

        return images
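Outside of Streamlit, the search task can be exercised directly once a video has been processed (the directory name below matches the demo video in app.py and assumes generate_log has already produced video.index and the frame images):

    from models import CLIP
    from tasks import VideoSearch

    search = VideoSearch("tQG6jYy9xto", CLIP())
    for img in search.search_engine("working at my computer"):
        img.show()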