import sys, os, argparse

import numpy as np
import cv2
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
import torch.backends.cudnn as cudnn
import torchvision
import torch.nn.functional as F
from PIL import Image

import datasets, hopenet, utils

from skimage import io
import dlib
import time

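# Head-pose estimation on a live RTSP stream: detect faces with dlib's CNN
# detector, estimate yaw/pitch/roll for each face with a pretrained Hopenet
# model, and draw the pose axes on the displayed frames. Adapted from the
# deep-head-pose test scripts.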
if __name__ == '__main__':

    cudnn.enabled = True

    batch_size = 1
    gpu = 'cuda:0'
    snapshot_path = './snapshots/hopenet_robust_alpha1.pkl'
    out_dir = 'output/video'

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # ResNet50 structure: Hopenet with three fully connected heads (yaw, pitch,
    # roll), each classifying over 66 bins spanning [-99, +99] degrees in
    # 3-degree steps.
    model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66)

    # Dlib CNN face detection model
    cnn_face_detector = dlib.cnn_face_detection_model_v1('./models/mmod_human_face_detector.dat')

    print('Loading snapshot.')
    # Load snapshot (pass map_location='cpu' to torch.load on a CPU-only machine).
    saved_state_dict = torch.load(snapshot_path)
    model.load_state_dict(saved_state_dict)

    print('Loading data.')

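    # Preprocessing: resize the short side to 224, center-crop to 224x224, and
    # normalize with the standard ImageNet statistics expected by the ResNet-50
    # backbone.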
    transformations = transforms.Compose([transforms.Resize(224),
        transforms.CenterCrop(224), transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

    model.cuda(gpu)

    print('Ready to test network.')

    # Test the model.
    model.eval()  # Change model to 'eval' mode (BN uses moving mean/var).

    # Bin indices 0..65, used below to turn each softmax distribution into an
    # expected angle.
    idx_tensor = [idx for idx in range(66)]
    idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)

    video = cv2.VideoCapture('rtsp://admin:a1234567@192.168.1.215:554/H.264/ch1/main/av_stream')
    if not video.isOpened():
        print('Could not open video stream.')
        sys.exit()

    width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    print('width = %d' % width)
    print('height = %d' % height)

    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
    # out = cv2.VideoWriter('output/video/output-%s.avi' % 's', fourcc, 25, (width, height))
    out = cv2.VideoWriter('output/video/output-%s.avi' % 's', fourcc, 25, (1024, 534))
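    # Note: the writer's frame size must match every frame passed to
    # out.write(); OpenCV will typically drop mismatched frames without raising
    # an error. All out.write() calls below are currently commented out.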
    # Per-frame pose log (hard-coded, machine-specific output path).
    txt_out = open('/home/basic/code/deep-head-pose/output/images/%s.txt' % 's', 'w')

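    # Main loop: grab frames from the stream, resize to 1024x534, run face
    # detection and pose estimation on every 8th frame, and display the result.
    # Press 'q' in the display window to quit.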
    frame_num = 1
    start = time.time()
    while frame_num <= 10000:
        ret, frame = video.read()
        if not ret:
            break

        frame = cv2.resize(frame, (1024, 534), interpolation=cv2.INTER_LINEAR)

        # Only run detection on every 8th frame; just display the others.
        if frame_num % 8 != 0:
            frame_num += 1
            cv2.imshow('frame', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
            # out.write(frame)
            continue

        cv2_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Dlib detect (the second argument upsamples the image once before detection)
        dets = cnn_face_detector(cv2_frame, 1)

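        # Each detection carries a rectangle and a confidence score;
        # low-confidence boxes are filtered out below.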
        for idx, det in enumerate(dets):
            # Get x_min, y_min, x_max, y_max, conf
            x_min = det.rect.left()
            y_min = det.rect.top()
            x_max = det.rect.right()
            y_max = det.rect.bottom()
            conf = det.confidence

            if conf > 1.0:
                # Expand the detection into a looser crop around the whole head.
                bbox_width = abs(x_max - x_min)
                bbox_height = abs(y_max - y_min)
                x_min -= 2 * bbox_width / 4
                x_max += 2 * bbox_width / 4
                y_min -= 3 * bbox_height / 4
                y_max += bbox_height / 4
                # Clamp the expanded box to the frame boundaries.
                x_min = max(x_min, 0); y_min = max(y_min, 0)
                x_max = min(frame.shape[1], x_max); y_max = min(frame.shape[0], y_max)

                # Crop image
                img = cv2_frame[int(y_min):int(y_max), int(x_min):int(x_max)]
                img = Image.fromarray(img)

                # Transform and add a batch dimension.
                img = transformations(img)
                img = img.unsqueeze(0)
                img = Variable(img).cuda(gpu)

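                # The network returns three 66-way logit vectors, one per
                # Euler angle (yaw, pitch, roll).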
                yaw, pitch, roll = model(img)

                yaw_predicted = F.softmax(yaw, dim=1)
                pitch_predicted = F.softmax(pitch, dim=1)
                roll_predicted = F.softmax(roll, dim=1)
                # Get continuous predictions in degrees: expected bin index,
                # scaled by 3 degrees per bin and offset so bin 33 maps to 0.
                yaw_predicted = (torch.sum(yaw_predicted.data[0] * idx_tensor) * 3 - 99).item()
                pitch_predicted = (torch.sum(pitch_predicted.data[0] * idx_tensor) * 3 - 99).item()
                roll_predicted = (torch.sum(roll_predicted.data[0] * idx_tensor) * 3 - 99).item()

                # Log the pose and draw the pose axes on the frame.
                txt_out.write(str(frame_num) + ' %f %f %f\n' % (yaw_predicted, pitch_predicted, roll_predicted))
                # utils.plot_pose_cube(frame, yaw_predicted, pitch_predicted, roll_predicted, (x_min + x_max) / 2, (y_min + y_max) / 2, size=bbox_width)
                utils.draw_axis(frame, yaw_predicted, pitch_predicted, roll_predicted, tdx=(x_min + x_max) / 2, tdy=(y_min + y_max) / 2, size=bbox_height * 3)
                # Plot expanded bounding box
                # cv2.rectangle(frame, (int(x_min), int(y_min)), (int(x_max), int(y_max)), (0, 255, 0), 1)

        cv2.imshow('frame', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        # out.write(frame)
        frame_num += 1

    end = time.time()
    print('Elapsed: %.2f s' % (end - start))

    txt_out.close()
    out.release()
    video.release()
    cv2.destroyAllWindows()