From 5efe61cdc3bed99edc287395211ef2f5af2b152d Mon Sep 17 00:00:00 2001
From: natanielruiz <nruiz9@gatech.edu>
Date: Mon, 30 Oct 2017 08:32:38 +0800
Subject: [PATCH] Dlib detection for video.

---
 code/test_on_video.py            |    3 
 code/test_on_video_dlib.py       |  163 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 code/test_on_video_dockerface.py |    3 
 3 files changed, 167 insertions(+), 2 deletions(-)

diff --git a/code/test_on_video.py b/code/test_on_video.py
index c2e97db..7c51348 100644
--- a/code/test_on_video.py
+++ b/code/test_on_video.py
@@ -126,6 +126,7 @@
         ret,frame = video.read()
         if ret == False:
             break
+        cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
 
         while True:
             x_min, y_min, x_max, y_max = int(float(line[1])), int(float(line[2])), int(float(line[3])), int(float(line[4]))
@@ -145,7 +146,7 @@
             x_max = min(frame.shape[1], x_max)
             y_max = min(frame.shape[0], y_max)
             # Crop face loosely
-            img = frame[y_min:y_max,x_min:x_max]
+            img = cv2_frame[y_min:y_max,x_min:x_max]
             img = Image.fromarray(img)
 
             # Transform
diff --git a/code/test_on_video_dlib.py b/code/test_on_video_dlib.py
new file mode 100644
index 0000000..0fbb080
--- /dev/null
+++ b/code/test_on_video_dlib.py
@@ -0,0 +1,163 @@
+import sys, os, argparse
+
+import numpy as np
+import cv2
+import matplotlib.pyplot as plt
+
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+from torch.utils.data import DataLoader
+from torchvision import transforms
+import torch.backends.cudnn as cudnn
+import torchvision
+import torch.nn.functional as F
+from PIL import Image
+
+import datasets, hopenet, utils
+
+from skimage import io
+import dlib
+
+def parse_args():
+    """Parse input arguments."""
+    parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.')
+    parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]',
+          default=0, type=int)
+    parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.',
+          default='', type=str)
+    parser.add_argument('--facedetection_model', dest='facedetection_model', help='Path of DLIB face detection model.',
+          default='', type=str)
+    parser.add_argument('--video', dest='video_path', help='Path of video')
+    parser.add_argument('--bboxes', dest='bboxes', help='Bounding box annotations of frames')
+    parser.add_argument('--output_string', dest='output_string', help='String appended to output file')
+    parser.add_argument('--n_frames', dest='n_frames', help='Number of frames', type=int)
+    parser.add_argument('--fps', dest='fps', help='Frames per second of source video', type=float, default=30.)
+    args = parser.parse_args()
+    return args
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    cudnn.enabled = True
+
+    batch_size = 1
+    gpu = args.gpu_id
+    snapshot_path = args.snapshot
+    out_dir = 'output/video'
+    video_path = args.video_path
+
+    if not os.path.exists(out_dir):
+        os.makedirs(out_dir)
+
+    if not os.path.exists(args.video_path):
+        sys.exit('Video does not exist')
+
+    # ResNet50 structure
+    model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66)
+
+    # Dlib face detection model
+    cnn_face_detector = dlib.cnn_face_detection_model_v1(args.facedetection_model)
+
+    print 'Loading snapshot.'
+    # Load snapshot
+    saved_state_dict = torch.load(snapshot_path)
+    model.load_state_dict(saved_state_dict)
+
+    print 'Loading data.'
+
+    transformations = transforms.Compose([transforms.Scale(224),
+    transforms.CenterCrop(224), transforms.ToTensor(),
+    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
+
+    model.cuda(gpu)
+
+    print 'Ready to test network.'
+
+    # Test the Model
+    model.eval()  # Change model to 'eval' mode (BN uses moving mean/var).
+    total = 0
+
+    idx_tensor = [idx for idx in xrange(66)]
+    idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)
+
+    video = cv2.VideoCapture(video_path)
+
+    # New cv2
+    width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))   # float
+    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) # float
+
+    # Define the codec and create VideoWriter object
+    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
+    out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, args.fps, (width, height))
+
+    # # Old cv2
+    # width = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH))   # float
+    # height = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT)) # float
+    #
+    # # Define the codec and create VideoWriter object
+    # fourcc = cv2.cv.CV_FOURCC(*'MJPG')
+    # out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, 30.0, (width, height))
+
+    txt_out = open('output/video/output-%s.txt' % args.output_string, 'w')
+
+    frame_num = 1
+
+    while frame_num <= args.n_frames:
+        ret,frame = video.read()
+        if ret == False:
+            break
+
+        cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
+
+        # Dlib detect
+        dets = cnn_face_detector(cv2_frame, 1)
+
+        for idx, det in enumerate(dets):
+            # Get x_min, y_min, x_max, y_max, conf
+            x_min = det.rect.left()
+            y_min = det.rect.top()
+            x_max = det.rect.right()
+            y_max = det.rect.bottom()
+            conf = det.confidence
+            print x_min, y_min, x_max, y_max, conf
+
+            if conf > 0.95:
+                bbox_width = abs(x_max - x_min)
+                bbox_height = abs(y_max - y_min)
+                x_min -= 3 * bbox_width / 4
+                x_max += 3 * bbox_width / 4
+                y_min -= 3 * bbox_height / 4
+                y_max += bbox_height / 4
+                x_min = max(x_min, 0); y_min = max(y_min, 0)
+                x_max = min(frame.shape[1], x_max); y_max = min(frame.shape[0], y_max)
+                # Crop image
+                img = cv2_frame[y_min:y_max,x_min:x_max]
+                img = Image.fromarray(img)
+
+                # Transform
+                img = transformations(img)
+                img_shape = img.size()
+                img = img.view(1, img_shape[0], img_shape[1], img_shape[2])
+                img = Variable(img).cuda(gpu)
+
+                yaw, pitch, roll = model(img)
+
+                yaw_predicted = F.softmax(yaw)
+                pitch_predicted = F.softmax(pitch)
+                roll_predicted = F.softmax(roll)
+                # Get continuous predictions in degrees.
+ yaw_predicted = torch.sum(yaw_predicted.data[0] * idx_tensor) * 3 - 99 + pitch_predicted = torch.sum(pitch_predicted.data[0] * idx_tensor) * 3 - 99 + roll_predicted = torch.sum(roll_predicted.data[0] * idx_tensor) * 3 - 99 + + # Print new frame with cube and axis + txt_out.write(str(frame_num) + ' %f %f %f\n' % (yaw_predicted, pitch_predicted, roll_predicted)) + # utils.plot_pose_cube(frame, yaw_predicted, pitch_predicted, roll_predicted, (x_min + x_max) / 2, (y_min + y_max) / 2, size = bbox_width) + utils.draw_axis(frame, yaw_predicted, pitch_predicted, roll_predicted, tdx = (x_min + x_max) / 2, tdy= (y_min + y_max) / 2, size = bbox_height/2) + # Plot expanded bounding box + # cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0,255,0), 1) + + out.write(frame) + frame_num += 1 + + out.release() + video.release() diff --git a/code/test_on_video_dockerface.py b/code/test_on_video_dockerface.py index 327b99a..9f09824 100644 --- a/code/test_on_video_dockerface.py +++ b/code/test_on_video_dockerface.py @@ -126,6 +126,7 @@ ret,frame = video.read() if ret == False: break + cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) while True: x_min, y_min, x_max, y_max, conf = int(float(line[1])), int(float(line[2])), int(float(line[3])), int(float(line[4])), float(line[5]) @@ -146,7 +147,7 @@ x_max = min(frame.shape[1], x_max) y_max = min(frame.shape[0], y_max) # Crop image - img = frame[y_min:y_max,x_min:x_max] + img = cv2_frame[y_min:y_max,x_min:x_max] img = Image.fromarray(img) # Transform -- Gitblit v1.8.0
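Note on the angle conversion in the new script: each of the three Hopenet heads outputs 66 bins covering -99 to +99 degrees in 3-degree steps, and the continuous angle is the softmax expectation over bin indices, scaled by 3 and shifted by -99. The standalone sketch below reproduces only that step; it assumes current PyTorch APIs and a made-up logits tensor (yaw_logits) instead of the Python 2 / Variable-era code in this commit, so treat it as an illustration rather than repository code.

    import torch
    import torch.nn.functional as F

    def bins_to_degrees(logits):
        # 66 bins spanning -99 to +99 degrees, 3 degrees per bin.
        idx_tensor = torch.arange(66, dtype=torch.float32)
        probs = F.softmax(logits, dim=1)                  # per-bin probabilities
        expected_bin = torch.sum(probs * idx_tensor, 1)   # expectation over bin indices
        return expected_bin * 3 - 99                      # map bin index to degrees

    yaw_logits = torch.randn(1, 66)   # hypothetical raw head output for one face crop
    print(bins_to_degrees(yaw_logits))

Taking the expectation instead of the argmax of the bins is what yields sub-3-degree resolution from a classifier-style output.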