New file

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
import torch.backends.cudnn as cudnn
import torchvision
import torch.nn.functional as F
from PIL import Image

import cv2
import matplotlib.pyplot as plt
import sys, os, argparse

import datasets, hopenet, utils
def parse_args():
    """Parse input arguments."""
    parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.')
    parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]',
                        default=0, type=int)
    parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.',
                        default='', type=str)
    parser.add_argument('--video', dest='video_path', help='Path of video')
    parser.add_argument('--bboxes', dest='bboxes', help='Bounding box annotations of frames')
    parser.add_argument('--output_string', dest='output_string', help='String appended to output file')
    args = parser.parse_args()
    return args
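
# Example invocation (script name and all paths below are illustrative
# placeholders, not files that ship with the repo):
#   python test_on_video.py --gpu 0 --snapshot output/snapshots/model.pkl \
#       --video data/clip.avi --bboxes data/clip_bboxes.txt --output_string clip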
if __name__ == '__main__':
    args = parse_args()

    cudnn.enabled = True

    batch_size = 1
    gpu = args.gpu_id
    snapshot_path = args.snapshot
    out_dir = 'output/video'
    video_path = args.video_path

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    if not os.path.exists(args.video_path):
        sys.exit('Video does not exist')

    # ResNet50 with 3 outputs.
    model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66)
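    # Each of the 3 heads (yaw, pitch, roll) outputs 66 bins of 3 degrees,
    # spanning -99 to +99 degrees; a continuous angle is recovered below as
    # the expectation over the bins.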
    # ResNet18
    # model = hopenet.Hopenet(torchvision.models.resnet.BasicBlock, [2, 2, 2, 2], 66)

    print 'Loading snapshot.'
    # Load snapshot
    saved_state_dict = torch.load(snapshot_path)
    model.load_state_dict(saved_state_dict)

    print 'Loading data.'

    # Deterministic test-time preprocessing: CenterCrop replaces the original
    # RandomCrop, which would have added random jitter at inference time.
    transformations = transforms.Compose([transforms.Scale(224),
                                          transforms.CenterCrop(224), transforms.ToTensor()])

    model.cuda(gpu)

    print 'Ready to test network.'

    # Test the Model
    model.eval()  # Change model to 'eval' mode (BN uses moving mean/var).
    total = 0

    idx_tensor = [idx for idx in xrange(66)]
    idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)

    video = cv2.VideoCapture(video_path)
    width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
    out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, 30.0, (width, height))
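    # Note: the writer assumes 30 fps; if the source video differs, reading
    # the rate via video.get(cv2.CAP_PROP_FPS) would keep playback speed correct.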

    bbox_file = open(args.bboxes, 'r')
    frame_num = 1

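    # Each line of the bbox file is expected to carry one detection,
    # space-separated, as parsed below:
    #   <frame_num> <x_min> <y_min> <x_max> <y_max>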
    # TODO: support for several bounding boxes
    for line in bbox_file:
        line = line.strip('\n')
        line = line.split(' ')
        det_frame_num = int(line[0])

        print frame_num

        # Stop at a certain frame number
        if frame_num > 10000:
            out.release()
            video.release()
            bbox_file.close()
            sys.exit(0)

        # Save all frames as they are if they don't have bbox annotation.
        while frame_num < det_frame_num:
            ret, frame = video.read()
            if not ret:
                out.release()
                video.release()
                bbox_file.close()
                sys.exit(0)
            out.write(frame)
            frame_num += 1

        ret, frame = video.read()
        if not ret:
            out.release()
            video.release()
            bbox_file.close()
            sys.exit(0)

        x_min, y_min, x_max, y_max = int(line[1]), int(line[2]), int(line[3]), int(line[4])
        # Crop the face region and convert OpenCV's BGR layout to the RGB
        # order that PIL (and the trained network) expect.
        img = frame[y_min:y_max, x_min:x_max]
        img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

        # Transform
        img = transformations(img)
        img_shape = img.size()
        img = img.view(1, img_shape[0], img_shape[1], img_shape[2])
        img = Variable(img).cuda(gpu)
        yaw, pitch, roll = model(img)

        yaw_predicted = F.softmax(yaw)
        pitch_predicted = F.softmax(pitch)
        roll_predicted = F.softmax(roll)
        # Get continuous predictions in degrees.
        yaw_predicted = torch.sum(yaw_predicted.data[0] * idx_tensor) * 3 - 99
        pitch_predicted = torch.sum(pitch_predicted.data[0] * idx_tensor) * 3 - 99
        roll_predicted = torch.sum(roll_predicted.data[0] * idx_tensor) * 3 - 99
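        # Expected value over the bin distribution:
        #   angle = (sum_i p_i * i) * 3 - 99 degrees,
        # mapping bin indices 0..65 back onto [-99, +96] degrees.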

        # Draw the pose cube on the frame (TODO: also draw axes).
        utils.plot_pose_cube(frame, yaw_predicted, pitch_predicted, roll_predicted, (x_min + x_max) / 2, (y_min + y_max) / 2, size=200)
        out.write(frame)

        frame_num += 1

    # Pass through any frames left after the last annotated one.
    while True:
        ret, frame = video.read()
        if not ret:
            out.release()
            video.release()
            bbox_file.close()
            sys.exit(0)
        out.write(frame)
        frame_num += 1

@@ ... @@

snapshot_path = os.path.join('output/snapshots', args.snapshot + '.pkl')

# ResNet18 backbone with 3 output heads; the ResNet50 variant is kept
# commented out as an alternative.
# model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66)
model = hopenet.Hopenet(torchvision.models.resnet.BasicBlock, [2, 2, 2, 2], 66)

print 'Loading snapshot.'
# Load snapshot

@@ ... @@

for i, (images, labels, name) in enumerate(test_loader):
    images = Variable(images).cuda(gpu)

    total += labels.size(0)
    label_yaw = labels[:, 0]
    label_pitch = labels[:, 1]

@@ ... @@

if args.save_viz:
    name = name[0]
    cv2_img = cv2.imread(os.path.join(args.data_dir, name + '.jpg'))
    # cv2_img = cv2.cvtColor(cv2_img, cv2.COLOR_RGB2BGR)
    # print name
    # print os.path.join('output/images', name + '.jpg')
    # print label_yaw[0] * 3 - 99, label_pitch[0] * 3 - 99, label_roll[0] * 3 - 99
    # print yaw_predicted * 3 - 99, pitch_predicted * 3 - 99, roll_predicted * 3 - 99
    utils.plot_pose_cube(cv2_img, yaw_predicted * 3 - 99, pitch_predicted * 3 - 99, roll_predicted * 3 - 99)

@@ ... @@

model.cuda(gpu)
criterion = nn.CrossEntropyLoss()
# Earlier runs used Adam and SGD; RMSprop is the active optimizer.
# optimizer = torch.optim.Adam([{'params': get_ignored_params(model), 'lr': args.lr},
#                               {'params': get_non_ignored_params(model), 'lr': args.lr * 10}],
#                              lr = args.lr)
# optimizer = torch.optim.SGD([{'params': get_ignored_params(model), 'lr': args.lr},
#                              {'params': get_non_ignored_params(model), 'lr': args.lr}],
#                             lr = args.lr, momentum=0.9)
optimizer = torch.optim.RMSprop([{'params': get_ignored_params(model), 'lr': args.lr},
                                 {'params': get_non_ignored_params(model), 'lr': args.lr * 10}],
                                lr = args.lr)
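# Two parameter groups: get_ignored_params presumably selects the pretrained
# backbone layers (not defined in this excerpt) and get_non_ignored_params
# the remainder, which trains at 10x the base learning rate.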

@@ ... @@

# epoch % 1 == 0 is trivially true, so a snapshot is taken every epoch.
if epoch % 1 == 0 and epoch < num_epochs - 1:
    print 'Taking snapshot...'
    torch.save(model.state_dict(),
               'output/snapshots/resnet50_binned_RMSprop_epoch_' + str(epoch+1) + '.pkl')

# Save the final trained model.
torch.save(model.state_dict(), 'output/snapshots/resnet50_binned_RMSprop_epoch_' + str(epoch+1) + '.pkl')

@@ ... @@

        face_y = tdy - 0.50 * size
    else:
        height, width = img.shape[:2]
        face_x = width / 2 - 0.5 * size
        face_y = height / 2 - 0.5 * size

    x1 = size * (cos(y) * cos(r)) + face_x
    y1 = size * (cos(p) * sin(r) + cos(r) * sin(p) * sin(y)) + face_y
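    # (x1, y1) is the projected image-plane endpoint of the cube edge along
    # the x axis after rotation by yaw (y), pitch (p) and roll (r); the
    # remaining corners follow from the analogous y- and z-axis projections.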