annayding committed
Commit bfa3aba · Parent(s): 376ad36
first commit

Files changed:
- app.py +116 -0
- owl_core.py +130 -0
- requirements.txt +10 -0
- utils.py +85 -0
app.py
ADDED
@@ -0,0 +1,116 @@
import os
import sys

# set CUDA_HOME
os.environ["CUDA_HOME"] = "/usr/local/cuda-12.3/"

import gradio as gr
from tqdm import tqdm
import cv2
import numpy as np
import pandas as pd
import torch

from typing import Tuple
from PIL import Image
from owl_core import owl_full_video


def run_owl(input_vid,
            text_prompt,
            confidence_threshold,
            fps_processed,
            scaling_factor
            ):
    # spaces in the filename break downstream processing, so rename the upload first
    new_input_vid = input_vid.replace(" ", "_")
    os.rename(input_vid, new_input_vid)
    csv_path, vid_path = owl_full_video(new_input_vid,
                                        text_prompt,
                                        confidence_threshold,
                                        fps_processed=fps_processed,
                                        scaling_factor=scaling_factor)

    # stash the result paths so vid_download can hand them to the download component
    global CSV_PATH
    CSV_PATH = csv_path
    global VID_PATH
    VID_PATH = vid_path
    return vid_path


def vid_download():
    """Return the CSV and annotated video paths for the download component."""
    print(CSV_PATH, VID_PATH)
    return [CSV_PATH, VID_PATH]


with gr.Blocks() as demo:
    gr.HTML(
        """
        <h1 align="center" style="font-size:xxx-large">🦍 Primate Detection</h1>
        """
    )

    with gr.Row():
        with gr.Column():
            input = gr.Video(label="Input Video", interactive=True)
            text_prompt = gr.Textbox(label="What do you want to detect? (Multiple species should be separated by commas)")
            with gr.Accordion("Advanced Options", open=False):
                conf_threshold = gr.Slider(
                    label="Confidence Threshold",
                    info="Adjust the threshold to change the sensitivity of the model; lower thresholds are more sensitive.",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.3,
                    step=0.05
                )
                fps_processed = gr.Slider(
                    label="Frame Detection Rate",
                    info="Adjust the frame detection rate: a value of 120 runs detection every 120 frames, a value of 1 runs detection on every frame. Note: the lower the number, the slower the processing time.",
                    minimum=1,
                    maximum=120,
                    value=30,
                    step=1
                )
                scaling_factor = gr.Slider(
                    label="Downsample Factor",
                    info="Adjust the downsample factor. Note: the higher the number, the faster the processing time but the lower the accuracy.",
                    minimum=1,
                    maximum=5,
                    value=2,
                    step=1
                )

            # TODO: Make button visible only after a file has been uploaded
            run_btn = gr.Button(value="Run Detection", visible=True)
        with gr.Column():
            vid = gr.Video(label="Output Video", height=350, interactive=False, visible=True)
            # download_btn = gr.Button(value="Generate Download", visible=True)
            download_file = gr.Files(label="CSV, Video Output", interactive=False)

    run_btn.click(fn=run_owl, inputs=[input, text_prompt, conf_threshold, fps_processed, scaling_factor], outputs=[vid])
    vid.change(fn=vid_download, outputs=download_file)

    # gr.Examples(
    #     [["baboon_15s.mp4", "baboon", 0.25, 0.25, 1, 1]],
    #     inputs=[input, text_prompt, conf_threshold, fps_processed, scaling_factor],
    #     outputs=[vid],
    #     fn=run_sam_dino,
    #     cache_examples=True,
    #     label='Example'
    # )

    gr.DuplicateButton()

    gr.Markdown(
        """
        ## Frequently Asked Questions

        ##### How can I run the interface on my own computer?
        By clicking on the three dots in the top right corner of the interface, you can clone the repository or run it with a Docker image on your local machine. \
        For local machine setup instructions, please check the README file.
        ##### The video is very slow to process, how can I speed it up?
        You can speed up processing by adjusting the frame detection rate in the advanced options: the higher the number, the faster the processing time. \
        You can also duplicate the Space using the Duplicate button and choose a faster GPU, which will speed up processing.
        """
    )

demo.launch(share=False)
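Because CSV_PATH and VID_PATH in app.py are module-level globals, concurrent users of the Space could overwrite each other's results. The snippet below is a minimal sketch, not part of the commit, of an alternative wiring that returns the file paths straight from the click handler; run_owl_direct is a hypothetical name, and the components referenced are the ones defined in app.py above.

# Sketch only: an alternative to the CSV_PATH/VID_PATH globals in app.py.
def run_owl_direct(input_vid, text_prompt, confidence_threshold, fps_processed, scaling_factor):
    new_input_vid = input_vid.replace(" ", "_")
    os.rename(input_vid, new_input_vid)
    csv_path, vid_path = owl_full_video(new_input_vid,
                                        text_prompt,
                                        confidence_threshold,
                                        fps_processed=fps_processed,
                                        scaling_factor=scaling_factor)
    # first return value feeds the output gr.Video, second feeds the gr.Files component
    return vid_path, [csv_path, vid_path]

# Inside the gr.Blocks() context, this single event would replace the
# run_btn.click / vid.change pair used in app.py:
# run_btn.click(fn=run_owl_direct,
#               inputs=[input, text_prompt, conf_threshold, fps_processed, scaling_factor],
#               outputs=[vid, download_file])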
owl_core.py
ADDED
@@ -0,0 +1,130 @@
import torch
from tqdm import tqdm
import cv2
import os
import numpy as np
import pandas as pd

from datetime import datetime
from typing import Tuple
from PIL import Image
from utils import plot_predictions, mp4_to_png, vid_stitcher
from transformers import Owlv2Processor, Owlv2ForObjectDetection


def preprocess_text(text_prompt: str, num_prompts: int = 1):
    """
    Takes a string of text prompts and returns a list of lists of text prompts, one list per image.
    i.e. text_prompt = "a, b, c" -> [["a", "b", "c"], ["a", "b", "c"]]
    """
    text_prompt = [s.strip() for s in text_prompt.split(",")]
    text_queries = [text_prompt] * num_prompts
    # print("text_queries:", text_queries)
    return text_queries


def owl_batch_prediction(
        images: torch.Tensor,
        text_queries: list[list[str]],  # assuming that every image is queried with the same text prompt
        threshold: float,
        processor,
        model,
        device: str = 'cuda'
    ):
    inputs = processor(text=text_queries, images=images, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
    target_sizes = torch.Tensor([img.size[::-1] for img in images]).to(device)
    # Convert outputs (bounding boxes and class logits) to COCO API, resize to the original image size and filter by threshold
    results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=threshold)

    return results


def owl_full_video(
        vid_path: str,
        text_prompt: str,
        threshold: float,
        fps_processed: int = 1,
        scaling_factor: float = 0.5,
        processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble"),
        model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble"),
        device: str = 'cuda',
        batch_size: int = 6,
    ):
    """ Same as owl_video, but processes the entire video regardless of detection bool.
    Saves per-frame results to a DataFrame and a CSV.
    """
    # create new dirs and paths for results
    filename = os.path.splitext(os.path.basename(vid_path))[0]
    results_dir = f'../results/{filename}_{datetime.now().strftime("%H%M%S")}'
    frames_dir = os.path.join(results_dir, "frames")

    # if the frames directory does not exist, create it and get the frames from the video
    if not os.path.exists(results_dir):
        os.makedirs(results_dir, exist_ok=True)
        os.makedirs(frames_dir, exist_ok=True)
        # process video and create a directory of video frames
        fps = mp4_to_png(vid_path, frames_dir, scaling_factor)

    # get all frame paths
    frame_filenames = os.listdir(frames_dir)

    frame_paths = []  # list of frame paths to process based on fps_processed
    # keep every fps_processed-th frame (sorted so frames stay in temporal order)
    for i, frame in enumerate(sorted(frame_filenames)):
        if i % fps_processed == 0:
            frame_paths.append(os.path.join(frames_dir, frame))

    # set up df for results
    df = pd.DataFrame(columns=["frame", "boxes", "scores", "labels"])

    # whether a directory for positive detection frames has been created (unused in this version)
    dir_created = False

    # run owl in batches
    for i in tqdm(range(0, len(frame_paths), batch_size), desc="Running batches"):
        batch_paths = frame_paths[i:i+batch_size]  # paths for this batch
        images = [Image.open(image_path) for image_path in batch_paths]

        # run owl on this batch of frames
        text_queries = preprocess_text(text_prompt, len(batch_paths))
        results = owl_batch_prediction(images, text_queries, threshold, processor, model, device)

        # get the label ids; None marks frames with no detections
        label_ids = []
        for entry in results:
            if entry['labels'].numel() > 0:
                label_ids.append(entry['labels'].tolist())
            else:
                label_ids.append(None)

        text = text_queries[0]  # assuming that all texts in the query are the same
        labels = []
        # convert label_ids to phrases; if there are no phrases, append None
        for idx in label_ids:
            if idx is not None:
                idx = [text[id] for id in idx]
                labels.append(idx)
            else:
                labels.append(None)

        for j, image in enumerate(batch_paths):
            boxes = results[j]['boxes'].cpu().numpy()
            scores = results[j]['scores'].cpu().numpy()
            row = pd.DataFrame({"frame": [image], "boxes": [boxes], "scores": [scores], "labels": [labels[j]]})
            df = pd.concat([df, row], ignore_index=True)

            # if there are detections, save the annotated frame over the original frame
            if labels[j] is not None:
                annotated_frame = plot_predictions(image, labels[j], scores, boxes)
                cv2.imwrite(image, annotated_frame)

    # save the df to a csv
    csv_path = f"{results_dir}/{filename}_{threshold}.csv"
    df.to_csv(csv_path, index=False)

    # stitch the frames back into a video at the source frame rate
    save_path = vid_stitcher(frames_dir, output_path=os.path.join(results_dir, "output.mp4"), fps=fps)

    return csv_path, save_path
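For reference, a short usage sketch (not part of the commit) showing owl_full_video called from a plain script rather than through the Gradio app; the clip path and prompt are hypothetical, and the keyword arguments mirror the defaults above.

# Usage sketch (hypothetical paths/prompt), assuming a CUDA device is available.
from owl_core import owl_full_video

csv_path, vid_path = owl_full_video(
    "clips/baboons.mp4",        # hypothetical input video
    "baboon, vervet monkey",    # comma-separated text prompt, split by preprocess_text
    threshold=0.3,
    fps_processed=30,           # run detection on every 30th extracted frame
    scaling_factor=0.5,
    batch_size=6,
)
print(csv_path)   # per-frame boxes, scores and labels
print(vid_path)   # annotated video stitched from the frames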
requirements.txt
ADDED
@@ -0,0 +1,10 @@
gradio==5.8.0
numpy==2.2.0
opencv_python==4.7.0.68
opencv_python_headless==4.8.1.78
pandas==1.4.2
Pillow==11.0.0
supervision==0.25.0
torch==2.0.1
tqdm==4.65.0
transformers==4.36.2
utils.py
ADDED
@@ -0,0 +1,85 @@
import torch
import numpy as np
import supervision as sv
import cv2
import os
from glob import glob
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor


def plot_predictions(
        image: str,
        labels: list[str],
        scores: np.ndarray,
        boxes: np.ndarray,
    ) -> np.ndarray:
    """Draw the predicted boxes and '<label> <score>' tags onto the frame stored at `image`."""
    image_source = cv2.imread(image)
    image_source = cv2.cvtColor(image_source, cv2.COLOR_BGR2RGB)

    # boxes/scores arrive as numpy arrays from owl_full_video; convert tensors just in case
    if isinstance(boxes, torch.Tensor):
        boxes = boxes.cpu().numpy()
    detections = sv.Detections(xyxy=boxes)

    labels = [
        f"{phrase} {logit:.2f}"
        for phrase, logit
        in zip(labels, scores)
    ]

    bbox_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
    label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
    annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
    annotated_frame = bbox_annotator.annotate(scene=annotated_frame, detections=detections)
    annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)

    return annotated_frame


def mp4_to_png(input_path: str, save_path: str, scale_factor: float) -> int:
    """ Converts an mp4 into one png per frame of the video.
    Args: input_path is the path to the mp4 file, save_path is the directory in which to save the frames.
    Returns: fps, the number of frames per second of the source video.
    """
    # get frames per second
    fps = int(cv2.VideoCapture(input_path).get(cv2.CAP_PROP_FPS))
    # run ffmpeg to convert the mp4 into scaled pngs
    os.system(f"ffmpeg -i {input_path} -vf 'scale=iw*{scale_factor}:ih*{scale_factor}, fps={fps}' {save_path}/frame%08d.png")
    return fps


def vid_stitcher(frames_dir: str, output_path: str, fps: int = 30) -> str:
    """
    Reads the frames in frames_dir in order and writes them to a video file at output_path.
    """
    # Get the sorted list of frame paths
    frame_list = sorted(glob(os.path.join(frames_dir, 'frame*.png')))

    # Prepare the VideoWriter using the first frame's dimensions
    frame = cv2.imread(frame_list[0])
    height, width, _ = frame.shape
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Use multithreading to read frames faster
    with ThreadPoolExecutor() as executor:
        frames = list(executor.map(cv2.imread, frame_list))

    # Write frames to the video
    with tqdm(total=len(frame_list), desc='Stitching frames') as pbar:
        for frame in frames:
            out.write(frame)
            pbar.update(1)
    out.release()

    return output_path


def count_pos(phrases, text_target):
    """
    Takes a list of lists of phrases and counts how many of the lists contain at least one entry equal to the target phrase.
    """
    num_pos = 0
    for sublist in phrases:
        if sublist is None:
            continue
        for phrase in sublist:
            if phrase == text_target:
                num_pos += 1
                break
    return num_pos
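count_pos is not called elsewhere in this commit; below is a small usage sketch with hypothetical data showing the intended shape of its input (one list of detected phrases per processed frame, or None for frames without detections, matching the labels written by owl_core).

# Usage sketch for count_pos (hypothetical data, not from the commit).
frame_phrases = [["baboon", "baboon"], None, ["vervet monkey"], ["baboon"]]
print(count_pos(frame_phrases, "baboon"))  # -> 2 frames contain at least one "baboon"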