From 5efe61cdc3bed99edc287395211ef2f5af2b152d Mon Sep 17 00:00:00 2001
From: natanielruiz <nruiz9@gatech.edu>
Date: Mon, 30 Oct 2017 08:32:38 +0800
Subject: [PATCH] Dlib detection for video.

---
 code/test_on_video.py            |    3 
 code/test_on_video_dlib.py       |  163 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 code/test_on_video_dockerface.py |    3 
 3 files changed, 167 insertions(+), 2 deletions(-)

diff --git a/code/test_on_video.py b/code/test_on_video.py
index c2e97db..7c51348 100644
--- a/code/test_on_video.py
+++ b/code/test_on_video.py
@@ -126,6 +126,7 @@
         ret,frame = video.read()
         if ret == False:
             break
+        cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
 
         while True:
             x_min, y_min, x_max, y_max = int(float(line[1])), int(float(line[2])), int(float(line[3])), int(float(line[4]))
@@ -145,7 +146,7 @@
             x_max = min(frame.shape[1], x_max)
             y_max = min(frame.shape[0], y_max)
             # Crop face loosely
-            img = frame[y_min:y_max,x_min:x_max]
+            img = cv2_frame[y_min:y_max,x_min:x_max]
             img = Image.fromarray(img)
 
             # Transform
diff --git a/code/test_on_video_dlib.py b/code/test_on_video_dlib.py
new file mode 100644
index 0000000..0fbb080
--- /dev/null
+++ b/code/test_on_video_dlib.py
@@ -0,0 +1,163 @@
+import sys, os, argparse
+
+import numpy as np
+import cv2
+import matplotlib.pyplot as plt
+
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+from torch.utils.data import DataLoader
+from torchvision import transforms
+import torch.backends.cudnn as cudnn
+import torchvision
+import torch.nn.functional as F
+from PIL import Image
+
+import datasets, hopenet, utils
+
+from skimage import io
+import dlib
+
+def parse_args():
+    """Parse input arguments."""
+    parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.')
+    parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]',
+          default=0, type=int)
+    parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.',
+          default='', type=str)
+    parser.add_argument('--facedetection_model', dest='facedetection_model', help='Path of DLIB face detection model.',
+          default='', type=str)
+    parser.add_argument('--video', dest='video_path', help='Path of video')
+    parser.add_argument('--bboxes', dest='bboxes', help='Bounding box annotations of frames')
+    parser.add_argument('--output_string', dest='output_string', help='String appended to output file')
+    parser.add_argument('--n_frames', dest='n_frames', help='Number of frames', type=int)
+    parser.add_argument('--fps', dest='fps', help='Frames per second of source video', type=float, default=30.)
+    args = parser.parse_args()
+    return args
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    cudnn.enabled = True
+
+    batch_size = 1
+    gpu = args.gpu_id
+    snapshot_path = args.snapshot
+    out_dir = 'output/video'
+    video_path = args.video_path
+
+    if not os.path.exists(out_dir):
+        os.makedirs(out_dir)
+
+    if not os.path.exists(args.video_path):
+        sys.exit('Video does not exist')
+
+    # ResNet50 structure
+    model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66)
+
+    # Dlib face detection model
+    cnn_face_detector = dlib.cnn_face_detection_model_v1(args.facedetection_model)
+
+    print 'Loading snapshot.'
+    # Load snapshot
+    saved_state_dict = torch.load(snapshot_path)
+    model.load_state_dict(saved_state_dict)
+
+    print 'Loading data.'
+
+    transformations = transforms.Compose([transforms.Scale(224),
+    transforms.CenterCrop(224), transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
+
+    model.cuda(gpu)
+
+    print 'Ready to test network.'
+
+    # Test the Model
+    model.eval()  # Change model to 'eval' mode (BN uses moving mean/var).
+    total = 0
+
+    idx_tensor = [idx for idx in xrange(66)]
+    idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)
+
+    video = cv2.VideoCapture(video_path)
+
+    # New cv2
+    width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))   # float
+    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) # float
+
+    # Define the codec and create VideoWriter object
+    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
+    out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, args.fps, (width, height))
+
+    # # Old cv2
+    # width = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH))   # float
+    # height = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT)) # float
+    #
+    # # Define the codec and create VideoWriter object
+    # fourcc = cv2.cv.CV_FOURCC(*'MJPG')
+    # out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, 30.0, (width, height))
+
+    txt_out = open('output/video/output-%s.txt' % args.output_string, 'w')
+
+    frame_num = 1
+
+    while frame_num <= args.n_frames:
+        ret,frame = video.read()
+        if ret == False:
+            break
+
+        cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
+
+        # Dlib detect
+        dets = cnn_face_detector(cv2_frame, 1)
+
+        for idx, det in enumerate(dets):
+            # Get x_min, y_min, x_max, y_max, conf
+            x_min = det.rect.left()
+            y_min = det.rect.top()
+            x_max = det.rect.right()
+            y_max = det.rect.bottom()
+            conf = det.confidence
+            print x_min, y_min, x_max, y_max, conf
+
+            if conf > 0.95:
+                bbox_width = abs(x_max - x_min)
+                bbox_height = abs(y_max - y_min)
+                x_min -= 3 * bbox_width / 4
+                x_max += 3 * bbox_width / 4
+                y_min -= 3 * bbox_height / 4
+                y_max += bbox_height / 4
+                x_min = max(x_min, 0); y_min = max(y_min, 0)
+                x_max = min(frame.shape[1], x_max); y_max = min(frame.shape[0], y_max)
+                # Crop image
+                img = cv2_frame[y_min:y_max,x_min:x_max]
+                img = Image.fromarray(img)
+
+                # Transform
+                img = transformations(img)
+                img_shape = img.size()
+                img = img.view(1, img_shape[0], img_shape[1], img_shape[2])
+                img = Variable(img).cuda(gpu)
+
+                yaw, pitch, roll = model(img)
+
+                yaw_predicted = F.softmax(yaw)
+                pitch_predicted = F.softmax(pitch)
+                roll_predicted = F.softmax(roll)
+                # Get continuous predictions in degrees.
+ yaw_predicted = torch.sum(yaw_predicted.data[0] * idx_tensor) * 3 - 99 + pitch_predicted = torch.sum(pitch_predicted.data[0] * idx_tensor) * 3 - 99 + roll_predicted = torch.sum(roll_predicted.data[0] * idx_tensor) * 3 - 99 + + # Print new frame with cube and axis + txt_out.write(str(frame_num) + ' %f %f %f\n' % (yaw_predicted, pitch_predicted, roll_predicted)) + # utils.plot_pose_cube(frame, yaw_predicted, pitch_predicted, roll_predicted, (x_min + x_max) / 2, (y_min + y_max) / 2, size = bbox_width) + utils.draw_axis(frame, yaw_predicted, pitch_predicted, roll_predicted, tdx = (x_min + x_max) / 2, tdy= (y_min + y_max) / 2, size = bbox_height/2) + # Plot expanded bounding box + # cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0,255,0), 1) + + out.write(frame) + frame_num += 1 + + out.release() + video.release() diff --git a/code/test_on_video_dockerface.py b/code/test_on_video_dockerface.py index 327b99a..9f09824 100644 --- a/code/test_on_video_dockerface.py +++ b/code/test_on_video_dockerface.py @@ -126,6 +126,7 @@ ret,frame = video.read() if ret == False: break + cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) while True: x_min, y_min, x_max, y_max, conf = int(float(line[1])), int(float(line[2])), int(float(line[3])), int(float(line[4])), float(line[5]) @@ -146,7 +147,7 @@ x_max = min(frame.shape[1], x_max) y_max = min(frame.shape[0], y_max) # Crop image - img = frame[y_min:y_max,x_min:x_max] + img = cv2_frame[y_min:y_max,x_min:x_max] img = Image.fromarray(img) # Transform -- Gitblit v1.8.0
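Note on the angle conversion in the new script: each of the three Hopenet heads outputs 66 bins covering -99 to +99 degrees in 3-degree steps, and the continuous angle is the softmax expectation over bin indices, scaled by 3 and shifted by -99. The standalone sketch below reproduces only that step; it assumes current PyTorch APIs and a made-up logits tensor (yaw_logits) instead of the Python 2 / Variable-era code in this commit, so treat it as an illustration rather than repository code.

    import torch
    import torch.nn.functional as F

    def bins_to_degrees(logits):
        # 66 bins spanning -99 to +99 degrees, 3 degrees per bin.
        idx_tensor = torch.arange(66, dtype=torch.float32)
        probs = F.softmax(logits, dim=1)                  # per-bin probabilities
        expected_bin = torch.sum(probs * idx_tensor, 1)   # expectation over bin indices
        return expected_bin * 3 - 99                      # map bin index to degrees

    yaw_logits = torch.randn(1, 66)   # hypothetical raw head output for one face crop
    print(bins_to_degrees(yaw_logits))

Taking the expectation instead of the argmax of the bins is what yields sub-3-degree resolution from a classifier-style output.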