This Colab demonstrates recognizing actions in video data using the tfhub.dev/deepmind/i3d-kinetics-400/1 module. More models for detecting actions in videos are available on TensorFlow Hub.
The underlying model is described in the paper "Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset" by Joao Carreira and Andrew Zisserman. The paper was posted on arXiv in May 2017 and published as a CVPR 2017 conference paper. The source code is publicly available on GitHub.
"Quo Vadis" introduced a new architecture for video classification, the Inflated 3D Convnet or I3D. This architecture achieved state-of-the-art results on the UCF101 and HMDB51 datasets from fine-tuning these models. I3D models pre-trained on Kinetics also placed first in the CVPR 2017 Charades challenge.
The original module was trained on the Kinetics-400 dataset and knows about 400 different actions. Labels for these actions can be found in the label map file.
In this Colab we will use it to recognize activities in videos from the UCF101 dataset.
Setup
pip install -q imageio
pip install -q opencv-python
pip install -q git+https://github.com/tensorflow/docs
Import the necessary modules
# TensorFlow and TF-Hub modules.
from absl import logging
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow_docs.vis import embed
logging.set_verbosity(logging.ERROR)
# Some modules to help with reading the UCF101 dataset.
import random
import re
import os
import tempfile
import ssl
import cv2
import numpy as np
# Some modules to display an animation using imageio.
import imageio
from IPython import display
from urllib import request # requires python3
Helper functions for the UCF101 dataset
# Utilities to fetch videos from UCF101 dataset
UCF_ROOT = "https://www.crcv.ucf.edu/THUMOS14/UCF101/UCF101/"
_VIDEO_LIST = None
_CACHE_DIR = tempfile.mkdtemp()
# As of July 2020, crcv.ucf.edu doesn't use a certificate accepted by the
# default Colab environment anymore.
unverified_context = ssl._create_unverified_context()
def list_ucf_videos():
  """Lists videos available in UCF101 dataset."""
  global _VIDEO_LIST
  if not _VIDEO_LIST:
    index = request.urlopen(UCF_ROOT, context=unverified_context).read().decode("utf-8")
    videos = re.findall(r"(v_[\w_]+\.avi)", index)
    _VIDEO_LIST = sorted(set(videos))
  return list(_VIDEO_LIST)
def fetch_ucf_video(video):
  """Fetches a video and caches it in the local filesystem."""
  cache_path = os.path.join(_CACHE_DIR, video)
  if not os.path.exists(cache_path):
    urlpath = request.urljoin(UCF_ROOT, video)
    print("Fetching %s => %s" % (urlpath, cache_path))
    data = request.urlopen(urlpath, context=unverified_context).read()
    with open(cache_path, "wb") as f:
      f.write(data)
  return cache_path
# Utilities to open video files using CV2
def crop_center_square(frame):
  y, x = frame.shape[0:2]
  min_dim = min(y, x)
  start_x = (x // 2) - (min_dim // 2)
  start_y = (y // 2) - (min_dim // 2)
  return frame[start_y:start_y+min_dim, start_x:start_x+min_dim]
def load_video(path, max_frames=0, resize=(224, 224)):
  cap = cv2.VideoCapture(path)
  frames = []
  try:
    while True:
      ret, frame = cap.read()
      if not ret:
        break
      frame = crop_center_square(frame)
      frame = cv2.resize(frame, resize)
      # Convert BGR (OpenCV's default channel order) to RGB.
      frame = frame[:, :, [2, 1, 0]]
      frames.append(frame)
      if len(frames) == max_frames:
        break
  finally:
    cap.release()
  return np.array(frames) / 255.0
def to_gif(images):
  converted_images = np.clip(images * 255, 0, 255).astype(np.uint8)
  imageio.mimsave('./animation.gif', converted_images, duration=40)
  return embed.embed_file('./animation.gif')
Get the kinetics-400 labels
# Get the kinetics-400 action labels from the GitHub repository.
KINETICS_URL = "https://raw.githubusercontent.com/deepmind/kinetics-i3d/master/data/label_map.txt"
with request.urlopen(KINETICS_URL) as obj:
  labels = [line.decode("utf-8").strip() for line in obj.readlines()]
print("Found %d labels." % len(labels))
Using the UCF101 dataset
# Get the list of videos in the dataset.
ucf_videos = list_ucf_videos()
categories = {}
for video in ucf_videos:
  # Filenames look like v_CategoryName_g01_c01.avi; strip the prefix and suffix.
  category = video[2:-12]
  if category not in categories:
    categories[category] = []
  categories[category].append(video)
print("Found %d videos in %d categories." % (len(ucf_videos), len(categories)))
for category, sequences in categories.items():
  summary = ", ".join(sequences[:2])
  print("%-20s %4d videos (%s, ...)" % (category, len(sequences), summary))
# Get a sample cricket video.
video_path = fetch_ucf_video("v_CricketShot_g04_c02.avi")
sample_video = load_video(video_path)
sample_video.shape
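load_video returns a single clip of shape (num_frames, 224, 224, 3) with RGB values scaled to [0, 1], which is the format the I3D module expects. A quick sanity check of that format, plus an optional preview using the to_gif helper defined above:
# Confirm the clip has 224x224 RGB frames with float values in [0, 1].
print(sample_video.shape)
print(sample_video.min(), sample_video.max())
# Optionally preview the clip as an animated GIF.
to_gif(sample_video)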
i3d = hub.load("https://tfhub.dev/deepmind/i3d-kinetics-400/1").signatures['default']
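The default signature loaded above is a concrete function that maps a batch of RGB clips to Kinetics-400 logits. If you want to double-check what it accepts and returns, a small sketch:
# Inspect the signature's input and output specs: a float32 video batch in,
# a dictionary with a 'default' logits tensor out.
print(i3d.structured_input_signature)
print(i3d.structured_outputs)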
Run the I3D model and print the top-5 action predictions.
def predict(sample_video):
  # Add a batch axis to the sample video.
  model_input = tf.constant(sample_video, dtype=tf.float32)[tf.newaxis, ...]
  logits = i3d(model_input)['default'][0]
  probabilities = tf.nn.softmax(logits)
  print("Top 5 actions:")
  for i in np.argsort(probabilities)[::-1][:5]:
    print(f"  {labels[i]:22}: {probabilities[i] * 100:5.2f}%")
predict(sample_video)
Now try a new video, from: https://commons.wikimedia.org/wiki/Category:Videos_of_sports
How about this video by Patrick Gillett:
curl -O https://upload.wikimedia.org/wikipedia/commons/8/86/End_of_a_jam.ogv
video_path = "End_of_a_jam.ogv"
sample_video = load_video(video_path)[:100]
sample_video.shape
to_gif(sample_video)
predict(sample_video)