Algorithm/deepHeadPose.git

parent: b215992b | 补丁 | 提交 | show whitespace

Temperature softmax and 10 shape PCA regression.

natanielruiz

2017-08-12 2eb13d63b15a8ac908d6fa324c7f3d19141ca570

Temperature softmax and 10 shape PCA regression.

4个文件已修改

2个文件已添加

	code/hopenet.py	26 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	code/test_resnet_bins.py	16 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	code/test_resnet_shape.py	145 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	code/train_resnet_shape.py	53 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	code/utils.py	5 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	practice/aflw_example.py	133 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史

 code/hopenet.py

@@ -106,7 +106,7 @@
    # This is just Hopenet with 3 output layers for yaw, pitch and roll.
    def __init__(self, block, layers, num_bins, shape_bins):
        self.inplanes = 64
        super(Hopenet, self).__init__()
        super(Hopenet_shape, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
@@ -120,7 +120,16 @@
        self.fc_yaw = nn.Linear(512 * block.expansion, num_bins)
        self.fc_pitch = nn.Linear(512 * block.expansion, num_bins)
        self.fc_roll = nn.Linear(512 * block.expansion, num_bins)
        self.fc_shape_0 = nn.Linear(512 * block.expansion, shape_bins)
        self.fc_shape_1 = nn.Linear(512 * block.expansion, shape_bins)
        self.fc_shape_2 = nn.Linear(512 * block.expansion, shape_bins)
        self.fc_shape_3 = nn.Linear(512 * block.expansion, shape_bins)
        self.fc_shape_4 = nn.Linear(512 * block.expansion, shape_bins)
        self.fc_shape_5 = nn.Linear(512 * block.expansion, shape_bins)
        self.fc_shape_6 = nn.Linear(512 * block.expansion, shape_bins)
        self.fc_shape_7 = nn.Linear(512 * block.expansion, shape_bins)
        self.fc_shape_8 = nn.Linear(512 * block.expansion, shape_bins)
        self.fc_shape_9 = nn.Linear(512 * block.expansion, shape_bins)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
@@ -163,6 +172,17 @@
        yaw = self.fc_yaw(x)
        pitch = self.fc_pitch(x)
        roll = self.fc_roll(x)
        shape_1 = self.fc_shape_1(x)

        return yaw, pitch, roll, shape_1
        shape = []
        shape.append(self.fc_shape_0(x))
        shape.append(self.fc_shape_1(x))
        shape.append(self.fc_shape_2(x))
        shape.append(self.fc_shape_3(x))
        shape.append(self.fc_shape_4(x))
        shape.append(self.fc_shape_5(x))
        shape.append(self.fc_shape_6(x))
        shape.append(self.fc_shape_7(x))
        shape.append(self.fc_shape_8(x))
        shape.append(self.fc_shape_9(x))

        return yaw, pitch, roll, shape

 code/test_resnet_bins.py

@@ -103,18 +103,14 @@
        _, pitch_bpred = torch.max(pitch.data, 1)
        _, roll_bpred = torch.max(roll.data, 1)

        yaw_predicted = F.softmax(yaw)
        pitch_predicted = F.softmax(pitch)
        roll_predicted = F.softmax(roll)

        # Continuous predictions
        yaw_predicted = torch.sum(yaw_predicted.data * idx_tensor, 1)
        pitch_predicted = torch.sum(pitch_predicted.data * idx_tensor, 1)
        roll_predicted = torch.sum(roll_predicted.data * idx_tensor, 1)
        yaw_predicted = utils.softmax_temperature(yaw.data, 1)
        pitch_predicted = utils.softmax_temperature(pitch.data, 1)
        roll_predicted = utils.softmax_temperature(roll.data, 1)

        yaw_predicted = yaw_predicted.cpu()
        pitch_predicted = pitch_predicted.cpu()
        roll_predicted = roll_predicted.cpu()
        yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1).cpu()
        pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1).cpu()
        roll_predicted = torch.sum(roll_predicted * idx_tensor, 1).cpu()

        # Mean absolute error
        yaw_error += torch.sum(torch.abs(yaw_predicted - label_yaw) * 3)

 code/test_resnet_shape.py

New file
@@ -0,0 +1,145 @@
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
import torch.backends.cudnn as cudnn
import torchvision
import torch.nn.functional as F

import cv2
import matplotlib.pyplot as plt
import sys
import os
import argparse

import datasets
import hopenet
import utils

def parse_args():
    """Parse the command-line arguments of the test script.

    Returns:
        argparse.Namespace with the attributes ``gpu_id``, ``data_dir``,
        ``filename_list``, ``snapshot``, ``batch_size`` and ``save_viz``.
    """
    def str2bool(value):
        # argparse hands the raw string to ``type``. Plain ``bool`` would map
        # every non-empty string (including 'False' and '0') to True, so the
        # usual "false" spellings are recognised explicitly; anything else
        # keeps evaluating to True as before.
        return value.strip().lower() not in ('false', 'f', 'no', 'n', '0', '')

    parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.')
    parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]',
            default=0, type=int)
    parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.',
          default='', type=str)
    parser.add_argument('--filename_list', dest='filename_list', help='Path to text file containing relative paths for every example.',
          default='', type=str)
    parser.add_argument('--snapshot', dest='snapshot', help='Name of model snapshot.',
          default='', type=str)
    parser.add_argument('--batch_size', dest='batch_size', help='Batch size.',
          default=1, type=int)
    parser.add_argument('--save_viz', dest='save_viz', help='Save images with pose cube.',
          default=False, type=str2bool)

    args = parser.parse_args()

    return args

# Script entry point: loads a Hopenet_shape snapshot, runs it over the
# AFLW2000_binned dataset on one GPU and prints the mean absolute error of the
# yaw, pitch and roll predictions.
if __name__ == '__main__':
    args = parse_args()

    cudnn.enabled = True
    gpu = args.gpu_id
    # Snapshots are read from output/snapshots/<snapshot>.pkl.
    snapshot_path = os.path.join('output/snapshots', args.snapshot + '.pkl')

    # ResNet101 with 3 outputs.
    # model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 23, 3], 66)
    # ResNet50
    # The two trailing arguments are num_bins (66) and shape_bins (60), in the
    # order taken by Hopenet_shape.__init__.
    model = hopenet.Hopenet_shape(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66, 60)
    # ResNet18
    # model = hopenet.Hopenet(torchvision.models.resnet.BasicBlock, [2, 2, 2, 2], 66)

    print 'Loading snapshot.'
    # Load snapshot
    saved_state_dict = torch.load(snapshot_path)
    model.load_state_dict(saved_state_dict)

    print 'Loading data.'

    # NOTE(review): RandomCrop picks a random crop position, so results may
    # vary between runs whenever the scaled image is larger than 224 along one
    # side; a deterministic crop may be intended for testing -- confirm.
    transformations = transforms.Compose([transforms.Scale(224),
    transforms.RandomCrop(224), transforms.ToTensor()])

    pose_dataset = datasets.AFLW2000_binned(args.data_dir, args.filename_list,
                                transformations)
    test_loader = torch.utils.data.DataLoader(dataset=pose_dataset,
                                               batch_size=args.batch_size,
                                               num_workers=2)

    model.cuda(gpu)

    print 'Ready to test network.'

    # Test the Model
    model.eval()  # Change model to 'eval' mode (BN uses moving mean/var).
    total = 0
    # The *_correct arrays are only referenced by the commented-out
    # binned-accuracy code further down.
    n_margins = 20
    yaw_correct = np.zeros(n_margins)
    pitch_correct = np.zeros(n_margins)
    roll_correct = np.zeros(n_margins)

    # Bin indices 0..65 on the GPU; used below to turn per-bin probabilities
    # into an expected (continuous) bin index.
    idx_tensor = [idx for idx in xrange(66)]
    idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)

    # Running sums of the absolute errors, divided by `total` at the end.
    yaw_error = .0
    pitch_error = .0
    roll_error = .0

    # NOTE(review): l1loss is not used in the rest of this block.
    l1loss = torch.nn.L1Loss(size_average=False)

    for i, (images, labels, name) in enumerate(test_loader):
        images = Variable(images).cuda(gpu)
        total += labels.size(0)
        # Columns 0, 1 and 2 of the labels are read as yaw, pitch and roll.
        label_yaw = labels[:,0].float()
        label_pitch = labels[:,1].float()
        label_roll = labels[:,2].float()

        # The shape outputs are returned by the model but not evaluated here.
        yaw, pitch, roll, shape = model(images)

        # Binned predictions
        # (index of the highest-scoring bin; only used by the commented-out
        # accuracy code below)
        _, yaw_bpred = torch.max(yaw.data, 1)
        _, pitch_bpred = torch.max(pitch.data, 1)
        _, roll_bpred = torch.max(roll.data, 1)

        # Continuous predictions
        # Softmax with temperature 1 over the bins, then the
        # probability-weighted sum of the bin indices, moved to the CPU.
        yaw_predicted = utils.softmax_temperature(yaw.data, 1)
        pitch_predicted = utils.softmax_temperature(pitch.data, 1)
        roll_predicted = utils.softmax_temperature(roll.data, 1)

        yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1).cpu()
        pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1).cpu()
        roll_predicted = torch.sum(roll_predicted * idx_tensor, 1).cpu()

        # Mean absolute error
        # Differences are measured in bins and multiplied by 3 -- presumably
        # the bin width in degrees, matching the "* 3 - 99" conversion used for
        # plotting below; confirm against the dataset binning.
        yaw_error += torch.sum(torch.abs(yaw_predicted - label_yaw) * 3)
        pitch_error += torch.sum(torch.abs(pitch_predicted - label_pitch) * 3)
        roll_error += torch.sum(torch.abs(roll_predicted - label_roll) * 3)

        # Binned Accuracy
        # for er in xrange(n_margins):
        #     yaw_bpred[er] += (label_yaw[0] in range(yaw_bpred[0,0] - er, yaw_bpred[0,0] + er + 1))
        #     pitch_bpred[er] += (label_pitch[0] in range(pitch_bpred[0,0] - er, pitch_bpred[0,0] + er + 1))
        #     roll_bpred[er] += (label_roll[0] in range(roll_bpred[0,0] - er, roll_bpred[0,0] + er + 1))

        # print label_yaw[0], yaw_bpred[0,0]

        # Save images with pose cube.
        # TODO: fix for larger batch size
        # Only the first sample of each batch is drawn and written out.
        if args.save_viz:
            name = name[0]
            cv2_img = cv2.imread(os.path.join(args.data_dir, name + '.jpg'))
            #print os.path.join('output/images', name + '.jpg')
            #print label_yaw[0] * 3 - 99, label_pitch[0] * 3 - 99, label_roll[0] * 3 - 99
            #print yaw_predicted * 3 - 99, pitch_predicted * 3 - 99, roll_predicted * 3 - 99
            utils.plot_pose_cube(cv2_img, yaw_predicted[0] * 3 - 99, pitch_predicted[0] * 3 - 99, roll_predicted[0] * 3 - 99)
            cv2.imwrite(os.path.join('output/images', name + '.jpg'), cv2_img)

    # '%' binds tighter than '+', so the format tuple is applied to the second
    # literal only, which holds all three placeholders.
    print('Test error in degrees of the model on the ' + str(total) +
    ' test images. Yaw: %.4f, Pitch: %.4f, Roll: %.4f' % (yaw_error / total,
    pitch_error / total, roll_error / total))

    # Binned accuracy
    # for idx in xrange(len(yaw_correct)):
    #     print yaw_correct[idx] / total, pitch_correct[idx] / total, roll_correct[idx] / total

 code/train_resnet_shape.py

@@ -66,7 +66,17 @@
    b.append(model.fc_yaw)
    b.append(model.fc_pitch)
    b.append(model.fc_roll)
    b.append(model.fc_shape_0)
    b.append(model.fc_shape_1)
    b.append(model.fc_shape_2)
    b.append(model.fc_shape_3)
    b.append(model.fc_shape_4)
    b.append(model.fc_shape_5)
    b.append(model.fc_shape_6)
    b.append(model.fc_shape_7)
    b.append(model.fc_shape_8)
    b.append(model.fc_shape_9)

    for i in range(len(b)):
        for j in b[i].modules():
            for k in j.parameters():
@@ -96,7 +106,7 @@
    # ResNet101 with 3 outputs
    # model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 23, 3], 66)
    # ResNet50
    model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66)
    model = hopenet.Hopenet_shape(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66, 60)
    # ResNet18
    # model = hopenet.Hopenet(torchvision.models.resnet.BasicBlock, [2, 2, 2, 2], 66)
    load_filtered_state_dict(model, model_zoo.load_url(model_urls['resnet50']))
@@ -114,8 +124,8 @@
                                               num_workers=2)

    model.cuda(gpu)
    criterion = nn.CrossEntropyLoss()
    reg_criterion = nn.MSELoss()
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    reg_criterion = nn.MSELoss().cuda(gpu)
    # Regression loss coefficient
    alpha = 0.1
    lsm = nn.Softmax()
@@ -124,21 +134,23 @@
    idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)

    optimizer = torch.optim.Adam([{'params': get_ignored_params(model), 'lr': args.lr},
                                  {'params': get_non_ignored_params(model), 'lr': args.lr * 10}],
                                  {'params': get_non_ignored_params(model), 'lr': args.lr}],
                                  lr = args.lr)

    print 'Ready to train network.'

    for epoch in range(num_epochs):
        for i, (images, labels, name) in enumerate(train_loader):
            images = Variable(images).cuda(gpu)
            label_yaw = Variable(labels[:,0]).cuda(gpu)
            label_pitch = Variable(labels[:,1]).cuda(gpu)
            label_roll = Variable(labels[:,2]).cuda(gpu)
            label_shape_1 = Variable(labels[:,3]).cuda(gpu)
            images = Variable(images.cuda(gpu))
            label_yaw = Variable(labels[:,0].cuda(gpu))
            label_pitch = Variable(labels[:,1].cuda(gpu))
            label_roll = Variable(labels[:,2].cuda(gpu))
            label_shape = Variable(labels[:,3:].cuda(gpu))

            optimizer.zero_grad()
            yaw, pitch, roll, shape_1 = model(images)
            model.zero_grad()

            yaw, pitch, roll, shape = model(images)

            # Cross entropy loss
            loss_yaw = criterion(yaw, label_yaw)
@@ -158,17 +170,18 @@
            loss_reg_pitch = reg_criterion(pitch_predicted, label_pitch.float())
            loss_reg_roll = reg_criterion(roll_predicted, label_roll.float())

            # Shape space loss
            loss_shape_1 = criterion(shape_1, label_shape_1)

            # Total loss
            loss_yaw += alpha * loss_reg_yaw
            loss_pitch += alpha * loss_reg_pitch
            loss_roll += alpha * loss_reg_roll

            loss_seq = [loss_yaw, loss_pitch, loss_roll, loss_shape_1]
            loss_seq = [loss_yaw, loss_pitch, loss_roll]

            # Shape space loss
            for idx in xrange(len(shape)):
                loss_seq.append(criterion(shape[idx], label_shape[:,idx]))

            grad_seq = [torch.Tensor(1).cuda(gpu) for _ in range(len(loss_seq))]
            model.zero_grad()
            torch.autograd.backward(loss_seq, grad_seq)
            optimizer.step()

@@ -176,17 +189,17 @@
            #        %(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss_yaw.data[0], loss_pitch.data[0], loss_roll.data[0]))

            if (i+1) % 100 == 0:
                print ('Epoch [%d/%d], Iter [%d/%d] Losses: Yaw %.4f, Pitch %.4f, Roll %.4f'
                       %(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss_yaw.data[0], loss_pitch.data[0], loss_roll.data[0]))
                print ('Epoch [%d/%d], Iter [%d/%d] Losses: Yaw %.4f, Pitch %.4f, Roll %.4f, Shape %.4f'
                       %(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss_yaw.data[0], loss_pitch.data[0], loss_roll.data[0], loss_seq[3].data[0]))
                if epoch == 0:
                    torch.save(model.state_dict(),
                    'output/snapshots/resnet50_iter_'+ str(i+1) + '.pkl')
                    'output/snapshots/resnet50_shape_iter_'+ str(i+1) + '.pkl')

        # Save models at numbered epochs.
        if epoch % 1 == 0 and epoch < num_epochs - 1:
            print 'Taking snapshot...'
            torch.save(model.state_dict(),
            'output/snapshots/resnet50_epoch_'+ str(epoch+1) + '.pkl')
            'output/snapshots/resnet50_shape_epoch_'+ str(epoch+1) + '.pkl')

    # Save the final Trained Model
    torch.save(model.state_dict(), 'output/snapshots/resnet50_epoch_' + str(epoch+1) + '.pkl')
    torch.save(model.state_dict(), 'output/snapshots/resnet50_shape_epoch_' + str(epoch+1) + '.pkl')

 code/utils.py

@@ -7,6 +7,11 @@
import math
from math import cos, sin

def softmax_temperature(tensor, temperature):
    """Return the softmax of ``tensor / temperature`` along dimension 1.

    Args:
        tensor: tensor of scores with at least two dimensions; the callers in
            this change pass the raw outputs of the angle classifiers.
        temperature: non-zero scalar divisor applied to the scores before the
            softmax; 1 gives the ordinary softmax.

    Returns:
        Tensor with the same shape as ``tensor`` whose entries along
        dimension 1 sum to 1.
    """
    scaled = tensor / temperature
    # Subtract the per-row maximum before exponentiating. The softmax value is
    # unchanged, but exp() can no longer overflow to inf (which would turn the
    # whole row into NaN after the division).
    row_max = torch.max(scaled, 1)[0].unsqueeze(1).expand_as(scaled)
    result = torch.exp(scaled - row_max)
    result = torch.div(result, torch.sum(result, 1).unsqueeze(1).expand_as(result))
    return result

def get_pose_params_from_mat(mat_path):
    # This functions gets the pose parameters from the .mat
    # Annotations that come with the 300W_LP dataset.

 practice/aflw_example.py

New file
@@ -0,0 +1,133 @@
#!/usr/bin/env python

##
# Massimiliano Patacchiola, Plymouth University 2016
# website: http://mpatacchiola.github.io/
# email: massimiliano.patacchiola@plymouth.ac.uk
# Python code for information retrieval from the Annotated Facial Landmarks in the Wild (AFLW) dataset.
# In this example the faces are isolated and saved in a specified output folder.
# Some information (roll, pitch, yaw) are returned, they can be used to filter the images.
# This code requires OpenCV and Numpy. You can easily bypass the OpenCV calls if you want to use
# a different library. In order to use the code you have to unzip the images and store them in
# the directory "flickr" maintaining the original folder names (0, 2, 3).
#
# The following are the database properties available (last updated version 2012-11-28):
#
# databases: db_id, path, description
# faceellipse: face_id, x, y, ra, rb, theta, annot_type_id, upsidedown
# faceimages: image_id, db_id, file_id, filepath, bw, widht, height
# facemetadata: face_id, sex, occluded, glasses, bw, annot_type_id
# facepose: face_id, roll, pitch, yaw, annot_type_id
# facerect: face_id, x, y, w, h, annot_type_id
# faces: face_id, file_id, db_id
# featurecoords: face_id, feature_id, x, y
# featurecoordtype: feature_id, descr, code, x, y, z

import sqlite3
import cv2
import os.path
import numpy as np

#Change these paths according to your directories
images_path = "./flickr/"
storing_path = "./output/"

def main():
    """Crop every annotated AFLW face and save it under ``storing_path``.

    Queries ``aflw.sqlite`` (opened relative to the current working directory)
    for the image path, pose (roll, pitch, yaw) and bounding rectangle of each
    face, loads the image from ``images_path``, converts it to grayscale,
    crops the face rectangle, writes the crop to ``storing_path`` and prints
    the retrieved values.

    Raises:
        ValueError: if an image referenced by the database does not exist
            under ``images_path``.
    """

    #Image counter
    counter = 1

    #Open the sqlite database
    conn = sqlite3.connect('aflw.sqlite')
    c = conn.cursor()

    #Creating the query string for retrieving: roll, pitch, yaw and faces position
    #Change it according to what you want to retrieve
    select_string = "faceimages.filepath, faces.face_id, facepose.roll, facepose.pitch, facepose.yaw, facerect.x, facerect.y, facerect.w, facerect.h"
    from_string = "faceimages, faces, facepose, facerect"
    where_string = "faces.face_id = facepose.face_id and faces.file_id = faceimages.file_id and faces.face_id = facerect.face_id"
    query_string = "SELECT " + select_string + " FROM " + from_string + " WHERE " + where_string

    try:
        #It iterates through the rows returned from the query
        for row in c.execute(query_string):

            #Using our specific query_string, the "row" variable will contain:
            # row[0] = image path
            # row[1] = face id
            # row[2] = roll
            # row[3] = pitch
            # row[4] = yaw
            # row[5] = face coord x
            # row[6] = face coord y
            # row[7] = face width
            # row[8] = face height

            #Creating the full path names for input and output
            input_path = images_path + str(row[0])
            output_path = storing_path + str(row[0])

            #If the file does not exist it raises an exception
            if not os.path.isfile(input_path):
                raise ValueError('Error: I cannot find the file specified: ' + str(input_path))

            #Load the colour version, then convert it to grayscale.
            #The image has to be read before cvtColor can be applied to it;
            #the two-value shape unpacking below needs a single-channel image.
            image = cv2.imread(input_path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

            #Image dimensions
            image_h, image_w = image.shape
            #Roll, pitch and yaw
            roll   = row[2]
            pitch  = row[3]
            yaw    = row[4]
            #Face rectangle coords
            face_x = row[5]
            face_y = row[6]
            face_w = row[7]
            face_h = row[8]

            #Error correction: clamp the origin to the image and shrink
            #over-sized rectangles to a square that fits the image
            if(face_x < 0): face_x = 0
            if(face_y < 0): face_y = 0
            if(face_w > image_w):
                face_w = image_w
                face_h = image_w
            if(face_h > image_h):
                face_h = image_h
                face_w = image_h

            #Crop the face from the image
            image_cropped = np.copy(image[face_y:face_y+face_h, face_x:face_x+face_w])
            #Uncomment the lines below if you want to rescale the image to a particular size
            #to_size = 64
            #image_rescaled = cv2.resize(image_cropped, (to_size,to_size), interpolation = cv2.INTER_AREA)
            #Uncomment the line below if you want to use adaptive histogram normalisation
            #clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(5,5))
            #image_normalised = clahe.apply(image_rescaled)
            #Save the image
            #change "image_cropped" with the last uncommented variable name above
            cv2.imwrite(output_path, image_cropped)

            #Printing the information
            #(single-argument print(...) gives the same output on Python 2 and 3)
            print("Counter: " + str(counter))
            print("iPath:    " + input_path)
            print("oPath:    " + output_path)
            print("Roll:    " + str(roll))
            print("Pitch:   " + str(pitch))
            print("Yaw:     " + str(yaw))
            print("x:       " + str(face_x))
            print("y:       " + str(face_y))
            print("w:       " + str(face_w))
            print("h:       " + str(face_h))
            print("")

            #Increasing the counter
            counter = counter + 1
    finally:
        #Once finished (or on error) it closes the cursor and the database
        c.close()
        conn.close()

if __name__ == "__main__":
    main()

			@@ -106,7 +106,7 @@
			# This is just Hopenet with 3 output layers for yaw, pitch and roll.
			def __init__(self, block, layers, num_bins, shape_bins):
			self.inplanes = 64
			super(Hopenet, self).__init__()
			super(Hopenet_shape, self).__init__()
			self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
			bias=False)
			self.bn1 = nn.BatchNorm2d(64)
			@@ -120,7 +120,16 @@
			self.fc_yaw = nn.Linear(512 * block.expansion, num_bins)
			self.fc_pitch = nn.Linear(512 * block.expansion, num_bins)
			self.fc_roll = nn.Linear(512 * block.expansion, num_bins)
			self.fc_shape_0 = nn.Linear(512 * block.expansion, shape_bins)
			self.fc_shape_1 = nn.Linear(512 * block.expansion, shape_bins)
			self.fc_shape_2 = nn.Linear(512 * block.expansion, shape_bins)
			self.fc_shape_3 = nn.Linear(512 * block.expansion, shape_bins)
			self.fc_shape_4 = nn.Linear(512 * block.expansion, shape_bins)
			self.fc_shape_5 = nn.Linear(512 * block.expansion, shape_bins)
			self.fc_shape_6 = nn.Linear(512 * block.expansion, shape_bins)
			self.fc_shape_7 = nn.Linear(512 * block.expansion, shape_bins)
			self.fc_shape_8 = nn.Linear(512 * block.expansion, shape_bins)
			self.fc_shape_9 = nn.Linear(512 * block.expansion, shape_bins)

			for m in self.modules():
			if isinstance(m, nn.Conv2d):
			@@ -163,6 +172,17 @@
			yaw = self.fc_yaw(x)
			pitch = self.fc_pitch(x)
			roll = self.fc_roll(x)
			shape_1 = self.fc_shape_1(x)

			return yaw, pitch, roll, shape_1
			shape = []
			shape.append(self.fc_shape_0(x))
			shape.append(self.fc_shape_1(x))
			shape.append(self.fc_shape_2(x))
			shape.append(self.fc_shape_3(x))
			shape.append(self.fc_shape_4(x))
			shape.append(self.fc_shape_5(x))
			shape.append(self.fc_shape_6(x))
			shape.append(self.fc_shape_7(x))
			shape.append(self.fc_shape_8(x))
			shape.append(self.fc_shape_9(x))

			return yaw, pitch, roll, shape

			@@ -103,18 +103,14 @@
			_, pitch_bpred = torch.max(pitch.data, 1)
			_, roll_bpred = torch.max(roll.data, 1)

			yaw_predicted = F.softmax(yaw)
			pitch_predicted = F.softmax(pitch)
			roll_predicted = F.softmax(roll)

			# Continuous predictions
			yaw_predicted = torch.sum(yaw_predicted.data * idx_tensor, 1)
			pitch_predicted = torch.sum(pitch_predicted.data * idx_tensor, 1)
			roll_predicted = torch.sum(roll_predicted.data * idx_tensor, 1)
			yaw_predicted = utils.softmax_temperature(yaw.data, 1)
			pitch_predicted = utils.softmax_temperature(pitch.data, 1)
			roll_predicted = utils.softmax_temperature(roll.data, 1)

			yaw_predicted = yaw_predicted.cpu()
			pitch_predicted = pitch_predicted.cpu()
			roll_predicted = roll_predicted.cpu()
			yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1).cpu()
			pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1).cpu()
			roll_predicted = torch.sum(roll_predicted * idx_tensor, 1).cpu()

			# Mean absolute error
			yaw_error += torch.sum(torch.abs(yaw_predicted - label_yaw) * 3)

New file
			@@ -0,0 +1,145 @@
			import numpy as np
			import torch
			import torch.nn as nn
			from torch.autograd import Variable
			from torch.utils.data import DataLoader
			from torchvision import transforms
			import torch.backends.cudnn as cudnn
			import torchvision
			import torch.nn.functional as F

			import cv2
			import matplotlib.pyplot as plt
			import sys
			import os
			import argparse

			import datasets
			import hopenet
			import utils

			def parse_args():
			    """Parse the command-line arguments of the test script.

			    Returns:
			        argparse.Namespace with the attributes ``gpu_id``, ``data_dir``,
			        ``filename_list``, ``snapshot``, ``batch_size`` and ``save_viz``.
			    """
			    def str2bool(value):
			        # argparse hands the raw string to ``type``. Plain ``bool`` would
			        # map every non-empty string (including 'False' and '0') to True,
			        # so the usual "false" spellings are recognised explicitly;
			        # anything else keeps evaluating to True as before.
			        return value.strip().lower() not in ('false', 'f', 'no', 'n', '0', '')

			    parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.')
			    parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]',
			                        default=0, type=int)
			    parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.',
			                        default='', type=str)
			    parser.add_argument('--filename_list', dest='filename_list', help='Path to text file containing relative paths for every example.',
			                        default='', type=str)
			    parser.add_argument('--snapshot', dest='snapshot', help='Name of model snapshot.',
			                        default='', type=str)
			    parser.add_argument('--batch_size', dest='batch_size', help='Batch size.',
			                        default=1, type=int)
			    parser.add_argument('--save_viz', dest='save_viz', help='Save images with pose cube.',
			                        default=False, type=str2bool)

			    args = parser.parse_args()

			    return args

			if __name__ == '__main__':
			args = parse_args()

			cudnn.enabled = True
			gpu = args.gpu_id
			snapshot_path = os.path.join('output/snapshots', args.snapshot + '.pkl')

			# ResNet101 with 3 outputs.
			# model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 23, 3], 66)
			# ResNet50
			model = hopenet.Hopenet_shape(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66, 60)
			# ResNet18
			# model = hopenet.Hopenet(torchvision.models.resnet.BasicBlock, [2, 2, 2, 2], 66)

			print 'Loading snapshot.'
			# Load snapshot
			saved_state_dict = torch.load(snapshot_path)
			model.load_state_dict(saved_state_dict)

			print 'Loading data.'

			transformations = transforms.Compose([transforms.Scale(224),
			transforms.RandomCrop(224), transforms.ToTensor()])

			pose_dataset = datasets.AFLW2000_binned(args.data_dir, args.filename_list,
			transformations)
			test_loader = torch.utils.data.DataLoader(dataset=pose_dataset,
			batch_size=args.batch_size,
			num_workers=2)

			model.cuda(gpu)

			print 'Ready to test network.'

			# Test the Model
			model.eval() # Change model to 'eval' mode (BN uses moving mean/var).
			total = 0
			n_margins = 20
			yaw_correct = np.zeros(n_margins)
			pitch_correct = np.zeros(n_margins)
			roll_correct = np.zeros(n_margins)

			idx_tensor = [idx for idx in xrange(66)]
			idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)

			yaw_error = .0
			pitch_error = .0
			roll_error = .0

			l1loss = torch.nn.L1Loss(size_average=False)

			for i, (images, labels, name) in enumerate(test_loader):
			images = Variable(images).cuda(gpu)
			total += labels.size(0)
			label_yaw = labels[:,0].float()
			label_pitch = labels[:,1].float()
			label_roll = labels[:,2].float()

			yaw, pitch, roll, shape = model(images)

			# Binned predictions
			_, yaw_bpred = torch.max(yaw.data, 1)
			_, pitch_bpred = torch.max(pitch.data, 1)
			_, roll_bpred = torch.max(roll.data, 1)

			# Continuous predictions
			yaw_predicted = utils.softmax_temperature(yaw.data, 1)
			pitch_predicted = utils.softmax_temperature(pitch.data, 1)
			roll_predicted = utils.softmax_temperature(roll.data, 1)

			yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1).cpu()
			pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1).cpu()
			roll_predicted = torch.sum(roll_predicted * idx_tensor, 1).cpu()

			# Mean absolute error
			yaw_error += torch.sum(torch.abs(yaw_predicted - label_yaw) * 3)
			pitch_error += torch.sum(torch.abs(pitch_predicted - label_pitch) * 3)
			roll_error += torch.sum(torch.abs(roll_predicted - label_roll) * 3)

			# Binned Accuracy
			# for er in xrange(n_margins):
			# yaw_bpred[er] += (label_yaw[0] in range(yaw_bpred[0,0] - er, yaw_bpred[0,0] + er + 1))
			# pitch_bpred[er] += (label_pitch[0] in range(pitch_bpred[0,0] - er, pitch_bpred[0,0] + er + 1))
			# roll_bpred[er] += (label_roll[0] in range(roll_bpred[0,0] - er, roll_bpred[0,0] + er + 1))

			# print label_yaw[0], yaw_bpred[0,0]

			# Save images with pose cube.
			# TODO: fix for larger batch size
			if args.save_viz:
			name = name[0]
			cv2_img = cv2.imread(os.path.join(args.data_dir, name + '.jpg'))
			#print os.path.join('output/images', name + '.jpg')
			#print label_yaw[0] * 3 - 99, label_pitch[0] * 3 - 99, label_roll[0] * 3 - 99
			#print yaw_predicted * 3 - 99, pitch_predicted * 3 - 99, roll_predicted * 3 - 99
			utils.plot_pose_cube(cv2_img, yaw_predicted[0] * 3 - 99, pitch_predicted[0] * 3 - 99, roll_predicted[0] * 3 - 99)
			cv2.imwrite(os.path.join('output/images', name + '.jpg'), cv2_img)

			print('Test error in degrees of the model on the ' + str(total) +
			' test images. Yaw: %.4f, Pitch: %.4f, Roll: %.4f' % (yaw_error / total,
			pitch_error / total, roll_error / total))

			# Binned accuracy
			# for idx in xrange(len(yaw_correct)):
			# print yaw_correct[idx] / total, pitch_correct[idx] / total, roll_correct[idx] / total

			@@ -66,7 +66,17 @@
			b.append(model.fc_yaw)
			b.append(model.fc_pitch)
			b.append(model.fc_roll)
			b.append(model.fc_shape_0)
			b.append(model.fc_shape_1)
			b.append(model.fc_shape_2)
			b.append(model.fc_shape_3)
			b.append(model.fc_shape_4)
			b.append(model.fc_shape_5)
			b.append(model.fc_shape_6)
			b.append(model.fc_shape_7)
			b.append(model.fc_shape_8)
			b.append(model.fc_shape_9)

			for i in range(len(b)):
			for j in b[i].modules():
			for k in j.parameters():
			@@ -96,7 +106,7 @@
			# ResNet101 with 3 outputs
			# model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 23, 3], 66)
			# ResNet50
			model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66)
			model = hopenet.Hopenet_shape(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66, 60)
			# ResNet18
			# model = hopenet.Hopenet(torchvision.models.resnet.BasicBlock, [2, 2, 2, 2], 66)
			load_filtered_state_dict(model, model_zoo.load_url(model_urls['resnet50']))
			@@ -114,8 +124,8 @@
			num_workers=2)

			model.cuda(gpu)
			criterion = nn.CrossEntropyLoss()
			reg_criterion = nn.MSELoss()
			criterion = nn.CrossEntropyLoss().cuda(gpu)
			reg_criterion = nn.MSELoss().cuda(gpu)
			# Regression loss coefficient
			alpha = 0.1
			lsm = nn.Softmax()
			@@ -124,21 +134,23 @@
			idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)

			optimizer = torch.optim.Adam([{'params': get_ignored_params(model), 'lr': args.lr},
			{'params': get_non_ignored_params(model), 'lr': args.lr * 10}],
			{'params': get_non_ignored_params(model), 'lr': args.lr}],
			lr = args.lr)

			print 'Ready to train network.'

			for epoch in range(num_epochs):
			for i, (images, labels, name) in enumerate(train_loader):
			images = Variable(images).cuda(gpu)
			label_yaw = Variable(labels[:,0]).cuda(gpu)
			label_pitch = Variable(labels[:,1]).cuda(gpu)
			label_roll = Variable(labels[:,2]).cuda(gpu)
			label_shape_1 = Variable(labels[:,3]).cuda(gpu)
			images = Variable(images.cuda(gpu))
			label_yaw = Variable(labels[:,0].cuda(gpu))
			label_pitch = Variable(labels[:,1].cuda(gpu))
			label_roll = Variable(labels[:,2].cuda(gpu))
			label_shape = Variable(labels[:,3:].cuda(gpu))

			optimizer.zero_grad()
			yaw, pitch, roll, shape_1 = model(images)
			model.zero_grad()

			yaw, pitch, roll, shape = model(images)

			# Cross entropy loss
			loss_yaw = criterion(yaw, label_yaw)
			@@ -158,17 +170,18 @@
			loss_reg_pitch = reg_criterion(pitch_predicted, label_pitch.float())
			loss_reg_roll = reg_criterion(roll_predicted, label_roll.float())

			# Shape space loss
			loss_shape_1 = criterion(shape_1, label_shape_1)

			# Total loss
			loss_yaw += alpha * loss_reg_yaw
			loss_pitch += alpha * loss_reg_pitch
			loss_roll += alpha * loss_reg_roll

			loss_seq = [loss_yaw, loss_pitch, loss_roll, loss_shape_1]
			loss_seq = [loss_yaw, loss_pitch, loss_roll]

			# Shape space loss
			for idx in xrange(len(shape)):
			loss_seq.append(criterion(shape[idx], label_shape[:,idx]))

			grad_seq = [torch.Tensor(1).cuda(gpu) for _ in range(len(loss_seq))]
			model.zero_grad()
			torch.autograd.backward(loss_seq, grad_seq)
			optimizer.step()

			@@ -176,17 +189,17 @@
			# %(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss_yaw.data[0], loss_pitch.data[0], loss_roll.data[0]))

			if (i+1) % 100 == 0:
			print ('Epoch [%d/%d], Iter [%d/%d] Losses: Yaw %.4f, Pitch %.4f, Roll %.4f'
			%(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss_yaw.data[0], loss_pitch.data[0], loss_roll.data[0]))
			print ('Epoch [%d/%d], Iter [%d/%d] Losses: Yaw %.4f, Pitch %.4f, Roll %.4f, Shape %.4f'
			%(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss_yaw.data[0], loss_pitch.data[0], loss_roll.data[0], loss_seq[3].data[0]))
			if epoch == 0:
			torch.save(model.state_dict(),
			'output/snapshots/resnet50_iter_'+ str(i+1) + '.pkl')
			'output/snapshots/resnet50_shape_iter_'+ str(i+1) + '.pkl')

			# Save models at numbered epochs.
			if epoch % 1 == 0 and epoch < num_epochs - 1:
			print 'Taking snapshot...'
			torch.save(model.state_dict(),
			'output/snapshots/resnet50_epoch_'+ str(epoch+1) + '.pkl')
			'output/snapshots/resnet50_shape_epoch_'+ str(epoch+1) + '.pkl')

			# Save the final Trained Model
			torch.save(model.state_dict(), 'output/snapshots/resnet50_epoch_' + str(epoch+1) + '.pkl')
			torch.save(model.state_dict(), 'output/snapshots/resnet50_shape_epoch_' + str(epoch+1) + '.pkl')

			@@ -7,6 +7,11 @@
			import math
			from math import cos, sin

			def softmax_temperature(tensor, temperature):
			    """Return the softmax of ``tensor / temperature`` along dimension 1.

			    Args:
			        tensor: scores to normalise; the normalisation runs over
			            dimension 1, so each slice along that dimension sums to 1.
			        temperature: divisor applied to the scores before the softmax.
			            Values above 1 flatten the distribution, values below 1
			            sharpen it. Must be non-zero.

			    Returns:
			        A tensor of the same shape as ``tensor`` holding the probabilities.
			    """
			    scaled = tensor / temperature
			    # Softmax is invariant to a per-row shift, so subtracting the row
			    # maximum changes nothing mathematically but keeps exp() from
			    # overflowing to inf (which previously produced inf / inf = NaN).
			    row_max = torch.max(scaled, 1)[0].unsqueeze(1).expand_as(scaled)
			    result = torch.exp(scaled - row_max)
			    result = torch.div(result, torch.sum(result, 1).unsqueeze(1).expand_as(result))
			    return result

			def get_pose_params_from_mat(mat_path):
			# This function gets the pose parameters from the .mat
			# annotations that come with the 300W_LP dataset.

New file
			@@ -0,0 +1,133 @@
			#!/usr/bin/env python

			##
			# Massimiliano Patacchiola, Plymouth University 2016
			# website: http://mpatacchiola.github.io/
			# email: massimiliano.patacchiola@plymouth.ac.uk
			# Python code for information retrieval from the Annotated Facial Landmarks in the Wild (AFLW) dataset.
			# In this example the faces are isolated and saved in a specified output folder.
			# Some information (roll, pitch, yaw) is printed for each face; it can be used to filter the images.
			# This code requires OpenCV and NumPy. You can easily bypass the OpenCV calls if you want to use
			# a different library. In order to use the code you have to unzip the images and store them in
			# the directory "flickr", maintaining the original folder names (0, 2, 3).
			#
			# The following are the database properties available (last updated version 2012-11-28):
			#
			# databases: db_id, path, description
			# faceellipse: face_id, x, y, ra, rb, theta, annot_type_id, upsidedown
			# faceimages: image_id, db_id, file_id, filepath, bw, widht, height
			# facemetadata: face_id, sex, occluded, glasses, bw, annot_type_id
			# facepose: face_id, roll, pitch, yaw, annot_type_id
			# facerect: face_id, x, y, w, h, annot_type_id
			# faces: face_id, file_id, db_id
			# featurecoords: face_id, feature_id, x, y
			# featurecoordtype: feature_id, descr, code, x, y, z

			import sqlite3
			import cv2
			import os.path
			import numpy as np

			#Change these paths according to your directories.
			#images_path is prefixed to each database file path to locate the input image;
			#storing_path is prefixed to the same file path to build the output location.
			images_path = "./flickr/"
			storing_path = "./output/"

			def main():
			    """Crop every annotated AFLW face to a grayscale image and print its pose.

			    Reads the joined image/pose/rectangle rows from 'aflw.sqlite', loads each
			    referenced image from ``images_path``, crops the face rectangle and writes
			    the crop under ``storing_path`` using the same relative file path. The
			    pose angles and the (corrected) rectangle are printed for every face.

			    Raises:
			        ValueError: if an image listed in the database is missing on disk
			            or cannot be decoded by OpenCV.
			    """
			    #Image counter
			    counter = 1

			    #Open the sqlite database
			    conn = sqlite3.connect('aflw.sqlite')
			    try:
			        c = conn.cursor()

			        #Creating the query string for retrieving: roll, pitch, yaw and faces position
			        #Change it according to what you want to retrieve
			        select_string = "faceimages.filepath, faces.face_id, facepose.roll, facepose.pitch, facepose.yaw, facerect.x, facerect.y, facerect.w, facerect.h"
			        from_string = "faceimages, faces, facepose, facerect"
			        where_string = "faces.face_id = facepose.face_id and faces.file_id = faceimages.file_id and faces.face_id = facerect.face_id"
			        query_string = "SELECT " + select_string + " FROM " + from_string + " WHERE " + where_string

			        #It iterates through the rows returned from the query
			        for row in c.execute(query_string):

			            #Using our specific query_string, the "row" variable will contain:
			            # row[0] = image path
			            # row[1] = face id
			            # row[2] = roll
			            # row[3] = pitch
			            # row[4] = yaw
			            # row[5] = face coord x
			            # row[6] = face coord y
			            # row[7] = face width
			            # row[8] = face height

			            #Creating the full path names for input and output
			            input_path = images_path + str(row[0])
			            output_path = storing_path + str(row[0])

			            #If the file does not exist it raises an exception
			            if not os.path.isfile(input_path):
			                raise ValueError('Error: I cannot find the file specified: ' + str(input_path))

			            #Load the colour version, then convert it to grayscale.
			            #The image must be read before cvtColor can be applied to it;
			            #imread signals an undecodable file by returning None.
			            image = cv2.imread(input_path)
			            if image is None:
			                raise ValueError('Error: I cannot read the image specified: ' + str(input_path))
			            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

			            #Image dimensions
			            image_h, image_w = image.shape
			            #Roll, pitch and yaw
			            roll = row[2]
			            pitch = row[3]
			            yaw = row[4]
			            #Face rectangle coords
			            face_x = row[5]
			            face_y = row[6]
			            face_w = row[7]
			            face_h = row[8]

			            #Error correction: clamp the rectangle to the image, keeping it square
			            if(face_x < 0): face_x = 0
			            if(face_y < 0): face_y = 0
			            if(face_w > image_w):
			                face_w = image_w
			                face_h = image_w
			            if(face_h > image_h):
			                face_h = image_h
			                face_w = image_h

			            #Crop the face from the image
			            image_cropped = np.copy(image[face_y:face_y+face_h, face_x:face_x+face_w])
			            #Uncomment the lines below if you want to rescale the image to a particular size
			            #to_size = 64
			            #image_rescaled = cv2.resize(image_cropped, (to_size,to_size), interpolation = cv2.INTER_AREA)
			            #Uncomment the line below if you want to use adaptive histogram normalisation
			            #clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(5,5))
			            #image_normalised = clahe.apply(image_rescaled)
			            #Save the image
			            #change "image_cropped" with the last uncommented variable name above
			            cv2.imwrite(output_path, image_cropped)

			            #Printing the information
			            #(single-argument print(...) behaves the same on Python 2 and 3)
			            print("Counter: " + str(counter))
			            print("iPath: " + input_path)
			            print("oPath: " + output_path)
			            print("Roll: " + str(roll))
			            print("Pitch: " + str(pitch))
			            print("Yaw: " + str(yaw))
			            print("x: " + str(face_x))
			            print("y: " + str(face_y))
			            print("w: " + str(face_w))
			            print("h: " + str(face_h))
			            print("")

			            #Increasing the counter
			            counter = counter + 1

			        #Once finished the iteration it closes the cursor
			        c.close()
			    finally:
			        #Always release the database connection, even when a row fails
			        conn.close()

			#Run the extraction only when this file is executed as a script
			if __name__ == "__main__":
			main()