import torch
import cv2
import videotransforms
import numpy as np
import gradio as gr
from einops import rearrange
from torchvision import transforms
from pytorch_i3d import InceptionI3d
def preprocess(vidpath):
    # Open the video file
    cap = cv2.VideoCapture(vidpath)
    frames = []
    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
    num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    # Extract frames from the video
    for _ in range(num):
        _, img = cap.read()
        # Skip frames OpenCV failed to decode
        if img is None:
            continue
        # Upscale so the shorter side is at least 226 px (OpenCV shape is (h, w, c))
        h, w, c = img.shape
        if h < 226 or w < 226:
            d = 226. - min(h, w)
            sc = 1 + d / min(h, w)
            img = cv2.resize(img, dsize=(0, 0), fx=sc, fy=sc)
        # Normalize pixel values to [-1, 1]
        img = (img / 255.) * 2 - 1
        frames.append(img)
    cap.release()
    frames = torch.Tensor(np.asarray(frames, dtype=np.float32))
    # Center-crop to 224x224 and reshape to (1, c, t, h, w)
    transform = transforms.Compose([videotransforms.CenterCrop(224)])
    frames = transform(frames)
    frames = rearrange(frames, 't h w c -> 1 c t h w')
    return frames
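# A minimal usage sketch (illustrative only, not part of the app flow; assumes
# one of the bundled example clips is present):
#
#     clip = preprocess('videos/no.mp4')
#     print(clip.shape)  # torch.Size([1, 3, T, 224, 224]), T = decoded frames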
def classify(video, dataset='WLASL100'):
    to_load = {
        'WLASL100': {'logits': 100, 'path': 'weights/asl100/FINAL_nslt_100_iters=896_top1=65.89_top5=84.11_top10=89.92.pt'},
        'WLASL2000': {'logits': 2000, 'path': 'weights/asl2000/FINAL_nslt_2000_iters=5104_top1=32.48_top5=57.31_top10=66.31.pt'}
    }
    # Preprocess the video into a (1, c, t, h, w) tensor
    inputs = preprocess(video)
    # Load the ImageNet-pretrained backbone, then the WLASL fine-tuned weights
    model = InceptionI3d()
    model.load_state_dict(torch.load('weights/rgb_imagenet.pt', map_location=torch.device('cpu')))
    model.replace_logits(to_load[dataset]['logits'])
    model.load_state_dict(torch.load(to_load[dataset]['path'], map_location=torch.device('cpu')))
    # Run on CPU: the free Spaces environment has no GPU
    model.cpu()
    # Evaluation mode
    model.eval()
    with torch.no_grad():  # Disable gradient computation during inference
        per_frame_logits = model(inputs)
    # Average the per-frame logits over time
    predictions = rearrange(per_frame_logits, '1 j k -> j k')
    predictions = torch.mean(predictions, dim=1)
    # Fetch the indices of the top 10 predictions
    _, index = torch.topk(predictions, 10)
    index = index.cpu().numpy()
    # Load the index -> gloss mapping
    with open('wlasl_class_list.txt') as f:
        idx2label = dict()
        for line in f:
            parts = line.split()
            idx2label[int(parts[0])] = parts[1]
    # Convert logits to probabilities
    predictions = torch.nn.functional.softmax(predictions, dim=0).cpu().numpy()
    # Return dict {label: probability} for the top predictions
    return {idx2label[i]: float(predictions[i]) for i in index}
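# A minimal usage sketch (illustrative only; runs the classifier outside the
# Gradio UI, assuming the example clip exists):
#
#     preds = classify('videos/no.mp4', dataset='WLASL100')
#     print(max(preds, key=preds.get))  # highest-probability gloss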
# Gradio App config
title = "I3D Sign Language Recognition"
description = "Gradio demo of word-level sign language classification using an I3D model pretrained on the WLASL video dataset. " \
              "WLASL is a large-scale dataset containing more than 2000 words in American Sign Language. " \
              "The examples used in the demo are videos from the test subset. " \
              "Note that WLASL100 contains 100 words while WLASL2000 contains 2000."
examples = [
    ['videos/no.mp4', 'WLASL100'],
    ['videos/all.mp4', 'WLASL100'],
    ['videos/before.mp4', 'WLASL100'],
    ['videos/blue.mp4', 'WLASL2000'],
    ['videos/white.mp4', 'WLASL2000'],
    ['videos/accident2.mp4', 'WLASL2000']
]
article = "NOTE: This is not the official demonstration of I3D sign language classification on the WLASL dataset. " \
          "More information about the WLASL dataset and pretrained I3D models can be found <a href='https://github.com/dxli94/WLASL'>here</a>."
# Gradio App interface
gr.Interface(fn=classify,
             inputs=[gr.inputs.Video(label="Video (*.mp4)"),
                     gr.inputs.Radio(choices=['WLASL100', 'WLASL2000'], default='WLASL100', label='Trained on:')],
             outputs=[gr.outputs.Label(num_top_classes=5, label='Top 5 Predictions')],
             allow_flagging="never",
             title=title,
             description=description,
             examples=examples,
             article=article).launch()