From 5efe61cdc3bed99edc287395211ef2f5af2b152d Mon Sep 17 00:00:00 2001
From: natanielruiz <nruiz9@gatech.edu>
Date: Mon, 30 Oct 2017 08:32:38 +0800
Subject: [PATCH] Dlib detection for video.
---
code/test_on_video.py | 3
code/test_on_video_dlib.py | 167 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
code/test_on_video_dockerface.py | 3
3 files changed, 171 insertions(+), 2 deletions(-)
diff --git a/code/test_on_video.py b/code/test_on_video.py
index c2e97db..7c51348 100644
--- a/code/test_on_video.py
+++ b/code/test_on_video.py
@@ -126,6 +126,7 @@
ret,frame = video.read()
if ret == False:
break
+ cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
while True:
x_min, y_min, x_max, y_max = int(float(line[1])), int(float(line[2])), int(float(line[3])), int(float(line[4]))
@@ -145,7 +146,7 @@
x_max = min(frame.shape[1], x_max)
y_max = min(frame.shape[0], y_max)
# Crop face loosely
- img = frame[y_min:y_max,x_min:x_max]
+ img = cv2_frame[y_min:y_max,x_min:x_max]
img = Image.fromarray(img)
# Transform
diff --git a/code/test_on_video_dlib.py b/code/test_on_video_dlib.py
new file mode 100644
index 0000000..0fbb080
--- /dev/null
+++ b/code/test_on_video_dlib.py
@@ -0,0 +1,167 @@
+import sys, os, argparse
+
+import numpy as np
+import cv2
+import matplotlib.pyplot as plt
+
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+from torch.utils.data import DataLoader
+from torchvision import transforms
+import torch.backends.cudnn as cudnn
+import torchvision
+import torch.nn.functional as F
+from PIL import Image
+
+import datasets, hopenet, utils
+
+from skimage import io
+import dlib
+
+def parse_args():
+ """Parse input arguments."""
+ parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.')
+ parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]',
+ default=0, type=int)
+ parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.',
+ default='', type=str)
+ parser.add_argument('--facedetection_model', dest='facedetection_model', help='Path of DLIB face detection model.',
+ default='', type=str)
+ parser.add_argument('--video', dest='video_path', help='Path of video')
+ parser.add_argument('--bboxes', dest='bboxes', help='Bounding box annotations of frames')
+ parser.add_argument('--output_string', dest='output_string', help='String appended to output file')
+ parser.add_argument('--n_frames', dest='n_frames', help='Number of frames', type=int)
+ parser.add_argument('--fps', dest='fps', help='Frames per second of source video', type=float, default=30.)
+ args = parser.parse_args()
+ return args
+
+if __name__ == '__main__':
+ args = parse_args()
+
+ cudnn.enabled = True
+
+ batch_size = 1
+ gpu = args.gpu_id
+ snapshot_path = args.snapshot
+ out_dir = 'output/video'
+ video_path = args.video_path
+
+ if not os.path.exists(out_dir):
+ os.makedirs(out_dir)
+
+ if not os.path.exists(args.video_path):
+ sys.exit('Video does not exist')
+
+ # ResNet50 structure
+ model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66)
+
+ # Dlib face detection model
+ cnn_face_detector = dlib.cnn_face_detection_model_v1(args.facedetection_model)
+
+ print 'Loading snapshot.'
+ # Load snapshot
+ saved_state_dict = torch.load(snapshot_path)
+ model.load_state_dict(saved_state_dict)
+
+ print 'Loading data.'
+
+ transformations = transforms.Compose([transforms.Scale(224),
+ transforms.CenterCrop(224), transforms.ToTensor(),
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
+
+ model.cuda(gpu)
+
+ print 'Ready to test network.'
+
+ # Test the Model
+ model.eval() # Change model to 'eval' mode (BN uses moving mean/var).
+ total = 0
+
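+ # Bin indices for the 66 pose bins (3 degrees each); used to take the expectation over the softmax output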
+ idx_tensor = [idx for idx in xrange(66)]
+ idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)
+
+ video = cv2.VideoCapture(video_path)
+
+ # New cv2
+ width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) # float
+ height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) # float
+
+ # Define the codec and create VideoWriter object
+ fourcc = cv2.VideoWriter_fourcc(*'MJPG')
+ out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, args.fps, (width, height))
+ txt_out = open('output/video/output-%s.txt' % args.output_string, 'w')  # text file for per-frame pose predictions (path assumed to mirror the .avi output)
+
+ # # Old cv2
+ # width = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH)) # float
+ # height = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT)) # float
+ #
+ # # Define the codec and create VideoWriter object
+ # fourcc = cv2.cv.CV_FOURCC(*'MJPG')
+ # out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, 30.0, (width, height))
+
+ frame_num = 1
+
+ while frame_num <= args.n_frames:
+ ret,frame = video.read()
+ if ret == False:
+ break
+
+ cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
+
+ # Dlib CNN face detection (second argument is the image upsample factor)
+ dets = cnn_face_detector(cv2_frame, 1)
+
+ for idx, det in enumerate(dets):
+ # Get x_min, y_min, x_max, y_max, conf
+ x_min = det.rect.left()
+ y_min = det.rect.top()
+ x_max = det.rect.right()
+ y_max = det.rect.bottom()
+ conf = det.confidence
+ print x_min, y_min, x_max, y_max, conf
+
+ if conf > 0.95:
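+ # Expand the detection box into a looser crop around the face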
+ bbox_width = abs(x_max - x_min)
+ bbox_height = abs(y_max - y_min)
+ x_min -= 3 * bbox_width / 4
+ x_max += 3 * bbox_width / 4
+ y_min -= 3 * bbox_height / 4
+ y_max += bbox_height / 4
+ x_min = max(x_min, 0); y_min = max(y_min, 0)
+ x_max = min(frame.shape[1], x_max); y_max = min(frame.shape[0], y_max)
+ # Crop image
+ img = cv2_frame[y_min:y_max,x_min:x_max]
+ img = Image.fromarray(img)
+
+ # Transform
+ img = transformations(img)
+ img_shape = img.size()
+ img = img.view(1, img_shape[0], img_shape[1], img_shape[2])
+ img = Variable(img).cuda(gpu)
+
+ yaw, pitch, roll = model(img)
+
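+ # Convert the three 66-bin outputs into probability distributions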
+ yaw_predicted = F.softmax(yaw)
+ pitch_predicted = F.softmax(pitch)
+ roll_predicted = F.softmax(roll)
+ # Get continuous predictions in degrees.
+ yaw_predicted = torch.sum(yaw_predicted.data[0] * idx_tensor) * 3 - 99
+ pitch_predicted = torch.sum(pitch_predicted.data[0] * idx_tensor) * 3 - 99
+ roll_predicted = torch.sum(roll_predicted.data[0] * idx_tensor) * 3 - 99
+
+ # Write pose to the text file and draw the pose axis on the frame
+ txt_out.write(str(frame_num) + ' %f %f %f\n' % (yaw_predicted, pitch_predicted, roll_predicted))
+ # utils.plot_pose_cube(frame, yaw_predicted, pitch_predicted, roll_predicted, (x_min + x_max) / 2, (y_min + y_max) / 2, size = bbox_width)
+ utils.draw_axis(frame, yaw_predicted, pitch_predicted, roll_predicted, tdx = (x_min + x_max) / 2, tdy= (y_min + y_max) / 2, size = bbox_height/2)
+ # Plot expanded bounding box
+ # cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0,255,0), 1)
+
+ out.write(frame)
+ frame_num += 1
+
+ out.release()
+ video.release()
diff --git a/code/test_on_video_dockerface.py b/code/test_on_video_dockerface.py
index 327b99a..9f09824 100644
--- a/code/test_on_video_dockerface.py
+++ b/code/test_on_video_dockerface.py
@@ -126,6 +126,7 @@
ret,frame = video.read()
if ret == False:
break
+ cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
while True:
x_min, y_min, x_max, y_max, conf = int(float(line[1])), int(float(line[2])), int(float(line[3])), int(float(line[4])), float(line[5])
@@ -146,7 +147,7 @@
x_max = min(frame.shape[1], x_max)
y_max = min(frame.shape[0], y_max)
# Crop image
- img = frame[y_min:y_max,x_min:x_max]
+ img = cv2_frame[y_min:y_max,x_min:x_max]
img = Image.fromarray(img)
# Transform
--
Gitblit v1.8.0