natanielruiz
2017-08-12 2eb13d63b15a8ac908d6fa324c7f3d19141ca570
Temperature softmax and 10 shape PCA regression.
2 files added
4 files modified
378 ■■■■■ lines changed
code/hopenet.py 26 ●●●●
code/test_resnet_bins.py 16 ●●●●●
code/test_resnet_shape.py 145 ●●●●●
code/train_resnet_shape.py 53 ●●●●●
code/utils.py 5 ●●●●●
practice/aflw_example.py 133 ●●●●●
code/hopenet.py
@@ -106,7 +106,7 @@
     # This is just Hopenet with 3 output layers for yaw, pitch and roll.
     def __init__(self, block, layers, num_bins, shape_bins):
         self.inplanes = 64
-        super(Hopenet, self).__init__()
+        super(Hopenet_shape, self).__init__()
         self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                                bias=False)
         self.bn1 = nn.BatchNorm2d(64)
@@ -120,7 +120,16 @@
         self.fc_yaw = nn.Linear(512 * block.expansion, num_bins)
         self.fc_pitch = nn.Linear(512 * block.expansion, num_bins)
         self.fc_roll = nn.Linear(512 * block.expansion, num_bins)
+        self.fc_shape_0 = nn.Linear(512 * block.expansion, shape_bins)
         self.fc_shape_1 = nn.Linear(512 * block.expansion, shape_bins)
+        self.fc_shape_2 = nn.Linear(512 * block.expansion, shape_bins)
+        self.fc_shape_3 = nn.Linear(512 * block.expansion, shape_bins)
+        self.fc_shape_4 = nn.Linear(512 * block.expansion, shape_bins)
+        self.fc_shape_5 = nn.Linear(512 * block.expansion, shape_bins)
+        self.fc_shape_6 = nn.Linear(512 * block.expansion, shape_bins)
+        self.fc_shape_7 = nn.Linear(512 * block.expansion, shape_bins)
+        self.fc_shape_8 = nn.Linear(512 * block.expansion, shape_bins)
+        self.fc_shape_9 = nn.Linear(512 * block.expansion, shape_bins)
         for m in self.modules():
             if isinstance(m, nn.Conv2d):
@@ -163,6 +172,17 @@
         yaw = self.fc_yaw(x)
         pitch = self.fc_pitch(x)
         roll = self.fc_roll(x)
-        shape_1 = self.fc_shape_1(x)
-        return yaw, pitch, roll, shape_1
+        shape = []
+        shape.append(self.fc_shape_0(x))
+        shape.append(self.fc_shape_1(x))
+        shape.append(self.fc_shape_2(x))
+        shape.append(self.fc_shape_3(x))
+        shape.append(self.fc_shape_4(x))
+        shape.append(self.fc_shape_5(x))
+        shape.append(self.fc_shape_6(x))
+        shape.append(self.fc_shape_7(x))
+        shape.append(self.fc_shape_8(x))
+        shape.append(self.fc_shape_9(x))
+        return yaw, pitch, roll, shape
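
Aside: the ten fc_shape_* heads are written out longhand above. A more compact construction with a functionally equivalent forward pass, sketched here with nn.ModuleList (an illustrative alternative, not the committed code; note the state-dict parameter names would differ):

import torch.nn as nn

class ShapeHeads(nn.Module):
    # Sketch: one binned-classification head per PCA shape coefficient.
    def __init__(self, in_features, shape_bins, num_coeffs=10):
        super(ShapeHeads, self).__init__()
        self.heads = nn.ModuleList(
            [nn.Linear(in_features, shape_bins) for _ in range(num_coeffs)])

    def forward(self, x):
        # Returns a list of [batch, shape_bins] logit tensors,
        # matching what fc_shape_0 .. fc_shape_9 produce above.
        return [head(x) for head in self.heads]

nn.ModuleList registers each head as a submodule, so the parameters appear in model.parameters() just as the named attributes do.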
code/test_resnet_bins.py
@@ -103,18 +103,14 @@
         _, pitch_bpred = torch.max(pitch.data, 1)
         _, roll_bpred = torch.max(roll.data, 1)
-        yaw_predicted = F.softmax(yaw)
-        pitch_predicted = F.softmax(pitch)
-        roll_predicted = F.softmax(roll)
         # Continuous predictions
-        yaw_predicted = torch.sum(yaw_predicted.data * idx_tensor, 1)
-        pitch_predicted = torch.sum(pitch_predicted.data * idx_tensor, 1)
-        roll_predicted = torch.sum(roll_predicted.data * idx_tensor, 1)
+        yaw_predicted = utils.softmax_temperature(yaw.data, 1)
+        pitch_predicted = utils.softmax_temperature(pitch.data, 1)
+        roll_predicted = utils.softmax_temperature(roll.data, 1)
-        yaw_predicted = yaw_predicted.cpu()
-        pitch_predicted = pitch_predicted.cpu()
-        roll_predicted = roll_predicted.cpu()
+        yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1).cpu()
+        pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1).cpu()
+        roll_predicted = torch.sum(roll_predicted * idx_tensor, 1).cpu()
         # Mean absolute error
         yaw_error += torch.sum(torch.abs(yaw_predicted - label_yaw) * 3)
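
The decoding scheme here turns a 66-way bin classification into a continuous angle: softmax the logits (temperature 1 is a plain softmax), take the expected bin index under that distribution, then map bins to degrees. A self-contained sketch of the whole pipeline (modern-PyTorch style, function name is illustrative):

import torch

def expected_angle_degrees(logits, temperature=1.0):
    # Softmax with temperature over the bin dimension.
    probs = torch.exp(logits / temperature)
    probs = probs / probs.sum(1, keepdim=True)
    # Expected bin index under the predicted distribution.
    idx = torch.arange(logits.size(1)).float()
    expected_bin = (probs * idx).sum(1)
    # 66 bins of 3 degrees covering [-99, 99): bin index -> degrees.
    return expected_bin * 3 - 99

The same bin width explains the * 3 in the error line above: labels are stored as 3-degree bin indices, so bin differences times 3 are errors in degrees.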
code/test_resnet_shape.py
New file
@@ -0,0 +1,145 @@
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
import torch.backends.cudnn as cudnn
import torchvision
import torch.nn.functional as F
import cv2
import matplotlib.pyplot as plt
import sys
import os
import argparse
import datasets
import hopenet
import utils
def parse_args():
    """Parse input arguments."""
    parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.')
    parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]',
            default=0, type=int)
    parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.',
          default='', type=str)
    parser.add_argument('--filename_list', dest='filename_list', help='Path to text file containing relative paths for every example.',
          default='', type=str)
    parser.add_argument('--snapshot', dest='snapshot', help='Name of model snapshot.',
          default='', type=str)
    parser.add_argument('--batch_size', dest='batch_size', help='Batch size.',
          default=1, type=int)
    parser.add_argument('--save_viz', dest='save_viz', help='Save images with pose cube.',
          default=False, type=bool)
    args = parser.parse_args()
    return args
if __name__ == '__main__':
    args = parse_args()
    cudnn.enabled = True
    gpu = args.gpu_id
    snapshot_path = os.path.join('output/snapshots', args.snapshot + '.pkl')
    # ResNet101 with 3 outputs.
    # model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 23, 3], 66)
    # ResNet50
    model = hopenet.Hopenet_shape(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66, 60)
    # ResNet18
    # model = hopenet.Hopenet(torchvision.models.resnet.BasicBlock, [2, 2, 2, 2], 66)
    print 'Loading snapshot.'
    # Load snapshot
    saved_state_dict = torch.load(snapshot_path)
    model.load_state_dict(saved_state_dict)
    print 'Loading data.'
    transformations = transforms.Compose([transforms.Scale(224),
    transforms.RandomCrop(224), transforms.ToTensor()])
    pose_dataset = datasets.AFLW2000_binned(args.data_dir, args.filename_list,
                                transformations)
    test_loader = torch.utils.data.DataLoader(dataset=pose_dataset,
                                               batch_size=args.batch_size,
                                               num_workers=2)
    model.cuda(gpu)
    print 'Ready to test network.'
    # Test the Model
    model.eval()  # Change model to 'eval' mode (BN uses moving mean/var).
    total = 0
    n_margins = 20
    yaw_correct = np.zeros(n_margins)
    pitch_correct = np.zeros(n_margins)
    roll_correct = np.zeros(n_margins)
    idx_tensor = [idx for idx in xrange(66)]
    idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)
    yaw_error = .0
    pitch_error = .0
    roll_error = .0
    l1loss = torch.nn.L1Loss(size_average=False)
    for i, (images, labels, name) in enumerate(test_loader):
        images = Variable(images).cuda(gpu)
        total += labels.size(0)
        label_yaw = labels[:,0].float()
        label_pitch = labels[:,1].float()
        label_roll = labels[:,2].float()
        yaw, pitch, roll, shape = model(images)
        # Binned predictions
        _, yaw_bpred = torch.max(yaw.data, 1)
        _, pitch_bpred = torch.max(pitch.data, 1)
        _, roll_bpred = torch.max(roll.data, 1)
        # Continuous predictions
        yaw_predicted = utils.softmax_temperature(yaw.data, 1)
        pitch_predicted = utils.softmax_temperature(pitch.data, 1)
        roll_predicted = utils.softmax_temperature(roll.data, 1)
        yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1).cpu()
        pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1).cpu()
        roll_predicted = torch.sum(roll_predicted * idx_tensor, 1).cpu()
        # Mean absolute error
        yaw_error += torch.sum(torch.abs(yaw_predicted - label_yaw) * 3)
        pitch_error += torch.sum(torch.abs(pitch_predicted - label_pitch) * 3)
        roll_error += torch.sum(torch.abs(roll_predicted - label_roll) * 3)
        # Binned Accuracy
        # for er in xrange(n_margins):
        #     yaw_bpred[er] += (label_yaw[0] in range(yaw_bpred[0,0] - er, yaw_bpred[0,0] + er + 1))
        #     pitch_bpred[er] += (label_pitch[0] in range(pitch_bpred[0,0] - er, pitch_bpred[0,0] + er + 1))
        #     roll_bpred[er] += (label_roll[0] in range(roll_bpred[0,0] - er, roll_bpred[0,0] + er + 1))
        # print label_yaw[0], yaw_bpred[0,0]
        # Save images with pose cube.
        # TODO: fix for larger batch size
        if args.save_viz:
            name = name[0]
            cv2_img = cv2.imread(os.path.join(args.data_dir, name + '.jpg'))
            #print os.path.join('output/images', name + '.jpg')
            #print label_yaw[0] * 3 - 99, label_pitch[0] * 3 - 99, label_roll[0] * 3 - 99
            #print yaw_predicted * 3 - 99, pitch_predicted * 3 - 99, roll_predicted * 3 - 99
            utils.plot_pose_cube(cv2_img, yaw_predicted[0] * 3 - 99, pitch_predicted[0] * 3 - 99, roll_predicted[0] * 3 - 99)
            cv2.imwrite(os.path.join('output/images', name + '.jpg'), cv2_img)
    print('Test error in degrees of the model on the ' + str(total) +
    ' test images. Yaw: %.4f, Pitch: %.4f, Roll: %.4f' % (yaw_error / total,
    pitch_error / total, roll_error / total))
    # Binned accuracy
    # for idx in xrange(len(yaw_correct)):
    #     print yaw_correct[idx] / total, pitch_correct[idx] / total, roll_correct[idx] / total
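
For reference, the error bookkeeping in the loop above works in bin units: predictions and labels are 3-degree bin indices, so the absolute difference times 3 is degrees, and the running sum divided by the image count is the mean absolute error. A tiny self-contained check (hypothetical values):

import torch

pred_bins = torch.tensor([10.0, 33.0, 50.0])
label_bins = torch.tensor([12.0, 30.0, 50.0])
# |10-12| + |33-30| + |50-50| = 5 bins -> 15 degrees total, MAE = 5 degrees.
mae_degrees = (pred_bins - label_bins).abs().sum() * 3 / pred_bins.numel()
print(mae_degrees)  # tensor(5.)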
code/train_resnet_shape.py
@@ -66,7 +66,17 @@
     b.append(model.fc_yaw)
     b.append(model.fc_pitch)
     b.append(model.fc_roll)
+    b.append(model.fc_shape_0)
     b.append(model.fc_shape_1)
+    b.append(model.fc_shape_2)
+    b.append(model.fc_shape_3)
+    b.append(model.fc_shape_4)
+    b.append(model.fc_shape_5)
+    b.append(model.fc_shape_6)
+    b.append(model.fc_shape_7)
+    b.append(model.fc_shape_8)
+    b.append(model.fc_shape_9)
     for i in range(len(b)):
         for j in b[i].modules():
             for k in j.parameters():
@@ -96,7 +106,7 @@
     # ResNet101 with 3 outputs
     # model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 23, 3], 66)
     # ResNet50
-    model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66)
+    model = hopenet.Hopenet_shape(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66, 60)
     # ResNet18
     # model = hopenet.Hopenet(torchvision.models.resnet.BasicBlock, [2, 2, 2, 2], 66)
     load_filtered_state_dict(model, model_zoo.load_url(model_urls['resnet50']))
@@ -114,8 +124,8 @@
                                                num_workers=2)
     model.cuda(gpu)
-    criterion = nn.CrossEntropyLoss()
-    reg_criterion = nn.MSELoss()
+    criterion = nn.CrossEntropyLoss().cuda(gpu)
+    reg_criterion = nn.MSELoss().cuda(gpu)
     # Regression loss coefficient
     alpha = 0.1
     lsm = nn.Softmax()
@@ -124,21 +134,23 @@
     idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu)
     optimizer = torch.optim.Adam([{'params': get_ignored_params(model), 'lr': args.lr},
-                                  {'params': get_non_ignored_params(model), 'lr': args.lr * 10}],
+                                  {'params': get_non_ignored_params(model), 'lr': args.lr}],
                                   lr = args.lr)
     print 'Ready to train network.'
     for epoch in range(num_epochs):
         for i, (images, labels, name) in enumerate(train_loader):
-            images = Variable(images).cuda(gpu)
-            label_yaw = Variable(labels[:,0]).cuda(gpu)
-            label_pitch = Variable(labels[:,1]).cuda(gpu)
-            label_roll = Variable(labels[:,2]).cuda(gpu)
-            label_shape_1 = Variable(labels[:,3]).cuda(gpu)
+            images = Variable(images.cuda(gpu))
+            label_yaw = Variable(labels[:,0].cuda(gpu))
+            label_pitch = Variable(labels[:,1].cuda(gpu))
+            label_roll = Variable(labels[:,2].cuda(gpu))
+            label_shape = Variable(labels[:,3:].cuda(gpu))
             optimizer.zero_grad()
-            yaw, pitch, roll, shape_1 = model(images)
+            model.zero_grad()
+            yaw, pitch, roll, shape = model(images)
             # Cross entropy loss
             loss_yaw = criterion(yaw, label_yaw)
@@ -158,17 +170,18 @@
             loss_reg_pitch = reg_criterion(pitch_predicted, label_pitch.float())
             loss_reg_roll = reg_criterion(roll_predicted, label_roll.float())
-            # Shape space loss
-            loss_shape_1 = criterion(shape_1, label_shape_1)
             # Total loss
             loss_yaw += alpha * loss_reg_yaw
             loss_pitch += alpha * loss_reg_pitch
             loss_roll += alpha * loss_reg_roll
-            loss_seq = [loss_yaw, loss_pitch, loss_roll, loss_shape_1]
+            loss_seq = [loss_yaw, loss_pitch, loss_roll]
+            # Shape space loss
+            for idx in xrange(len(shape)):
+                loss_seq.append(criterion(shape[idx], label_shape[:,idx]))
             grad_seq = [torch.Tensor(1).cuda(gpu) for _ in range(len(loss_seq))]
             model.zero_grad()
             torch.autograd.backward(loss_seq, grad_seq)
             optimizer.step()
@@ -176,17 +189,17 @@
             #        %(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss_yaw.data[0], loss_pitch.data[0], loss_roll.data[0]))
             if (i+1) % 100 == 0:
-                print ('Epoch [%d/%d], Iter [%d/%d] Losses: Yaw %.4f, Pitch %.4f, Roll %.4f'
-                       %(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss_yaw.data[0], loss_pitch.data[0], loss_roll.data[0]))
+                print ('Epoch [%d/%d], Iter [%d/%d] Losses: Yaw %.4f, Pitch %.4f, Roll %.4f, Shape %.4f'
+                       %(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss_yaw.data[0], loss_pitch.data[0], loss_roll.data[0], loss_seq[3].data[0]))
                 if epoch == 0:
                     torch.save(model.state_dict(),
-                    'output/snapshots/resnet50_iter_'+ str(i+1) + '.pkl')
+                    'output/snapshots/resnet50_shape_iter_'+ str(i+1) + '.pkl')
         # Save models at numbered epochs.
         if epoch % 1 == 0 and epoch < num_epochs - 1:
             print 'Taking snapshot...'
             torch.save(model.state_dict(),
-            'output/snapshots/resnet50_epoch_'+ str(epoch+1) + '.pkl')
+            'output/snapshots/resnet50_shape_epoch_'+ str(epoch+1) + '.pkl')
     # Save the final Trained Model
-    torch.save(model.state_dict(), 'output/snapshots/resnet50_epoch_' + str(epoch+1) + '.pkl')
+    torch.save(model.state_dict(), 'output/snapshots/resnet50_shape_epoch_' + str(epoch+1) + '.pkl')
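
The training step backpropagates a list of thirteen losses (yaw, pitch, roll, and ten shape coefficients) through torch.autograd.backward, with the grad_seq entries serving as the grad_tensors argument. With unit gradients this accumulates the same parameter gradients as summing the losses and calling .backward() once. A minimal self-contained illustration with a toy two-head model (modern PyTorch; the layer sizes are made up):

import torch
import torch.nn as nn

feat = nn.Linear(8, 16)                                 # shared trunk
head_a, head_b = nn.Linear(16, 66), nn.Linear(16, 60)   # two task heads
criterion = nn.CrossEntropyLoss()

x = torch.randn(4, 8)
ya = torch.randint(0, 66, (4,))
yb = torch.randint(0, 60, (4,))

h = feat(x)
losses = [criterion(head_a(h), ya), criterion(head_b(h), yb)]
# Backward over the list with unit gradients, as in the commit ...
torch.autograd.backward(losses, [torch.ones(()) for _ in losses])
# ... which is equivalent to: sum(losses).backward()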
code/utils.py
@@ -7,6 +7,11 @@
 import math
 from math import cos, sin
+def softmax_temperature(tensor, temperature):
+    result = torch.exp(tensor / temperature)
+    result = torch.div(result, torch.sum(result, 1).unsqueeze(1).expand_as(result))
+    return result
 def get_pose_params_from_mat(mat_path):
     # This function gets the pose parameters from the .mat
     # annotations that come with the 300W_LP dataset.
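
softmax_temperature divides the logits by a temperature before exponentiating: T = 1 reduces to the ordinary softmax, T < 1 sharpens the distribution, T > 1 flattens it. A quick standalone demonstration of the effect (illustrative logits):

import torch

logits = torch.tensor([[2.0, 1.0, 0.1]])
for T in (0.5, 1.0, 2.0):
    probs = torch.exp(logits / T)
    probs = probs / probs.sum(1, keepdim=True)  # same math as softmax_temperature
    print(T, probs)

At T = 1 this matches F.softmax along dim 1, which is what the test scripts rely on when they call utils.softmax_temperature(yaw.data, 1).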
practice/aflw_example.py
New file
@@ -0,0 +1,133 @@
#!/usr/bin/env python
##
# Massimiliano Patacchiola, Plymouth University 2016
# website: http://mpatacchiola.github.io/
# email: massimiliano.patacchiola@plymouth.ac.uk
# Python code for information retrieval from the Annotated Facial Landmarks in the Wild (AFLW) dataset.
# In this example the faces are isolated and saved in a specified output folder.
# Some information (roll, pitch, yaw) is returned; it can be used to filter the images.
# This code requires OpenCV and Numpy. You can easily bypass the OpenCV calls if you want to use
# a different library. In order to use the code you have to unzip the images and store them in
# the directory "flickr", maintaining the original folder names (0, 2, 3).
#
# The following are the database properties available (last updated version 2012-11-28):
#
# databases: db_id, path, description
# faceellipse: face_id, x, y, ra, rb, theta, annot_type_id, upsidedown
# faceimages: image_id, db_id, file_id, filepath, bw, width, height
# facemetadata: face_id, sex, occluded, glasses, bw, annot_type_id
# facepose: face_id, roll, pitch, yaw, annot_type_id
# facerect: face_id, x, y, w, h, annot_type_id
# faces: face_id, file_id, db_id
# featurecoords: face_id, feature_id, x, y
# featurecoordtype: feature_id, descr, code, x, y, z
import sqlite3
import cv2
import os.path
import numpy as np
#Change these paths according to your directories
images_path = "./flickr/"
storing_path = "./output/"
def main():
    #Image counter
    counter = 1
    #Open the sqlite database
    conn = sqlite3.connect('aflw.sqlite')
    c = conn.cursor()
    #Creating the query string for retrieving: roll, pitch, yaw and face position
    #Change it according to what you want to retrieve
    select_string = "faceimages.filepath, faces.face_id, facepose.roll, facepose.pitch, facepose.yaw, facerect.x, facerect.y, facerect.w, facerect.h"
    from_string = "faceimages, faces, facepose, facerect"
    where_string = "faces.face_id = facepose.face_id and faces.file_id = faceimages.file_id and faces.face_id = facerect.face_id"
    query_string = "SELECT " + select_string + " FROM " + from_string + " WHERE " + where_string
    #It iterates through the rows returned from the query
    for row in c.execute(query_string):
        #Using our specific query_string, the "row" variable will contain:
        # row[0] = image path
        # row[1] = face id
        # row[2] = roll
        # row[3] = pitch
        # row[4] = yaw
        # row[5] = face coord x
        # row[6] = face coord y
        # row[7] = face width
        # row[8] = face height
        #Creating the full path names for input and output
        input_path = images_path + str(row[0])
        output_path = storing_path + str(row[0])
        #If the file exists then open it
        if os.path.isfile(input_path):
            #image = cv2.imread(input_path, 0) #load directly in grayscale
            image = cv2.imread(input_path) #load the colour version
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) #convert to grayscale
            #Image dimensions
            image_h, image_w = image.shape
            #Roll, pitch and yaw
            roll   = row[2]
            pitch  = row[3]
            yaw    = row[4]
            #Face rectangle coords
            face_x = row[5]
            face_y = row[6]
            face_w = row[7]
            face_h = row[8]
            #Error correction
            if(face_x < 0): face_x = 0
            if(face_y < 0): face_y = 0
            if(face_w > image_w):
                face_w = image_w
                face_h = image_w
            if(face_h > image_h):
                face_h = image_h
                face_w = image_h
            #Crop the face from the image
            image_cropped = np.copy(image[face_y:face_y+face_h, face_x:face_x+face_w])
            #Uncomment the lines below if you want to rescale the image to a particular size
            #to_size = 64
            #image_rescaled = cv2.resize(image_cropped, (to_size,to_size), interpolation = cv2.INTER_AREA)
            #Uncomment the line below if you want to use adaptive histogram normalisation
            #clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(5,5))
            #image_normalised = clahe.apply(image_rescaled)
            #Save the image
            #change "image_cropped" with the last uncommented variable name above
            cv2.imwrite(output_path, image_cropped)
            #Printing the information
            print "Counter: " + str(counter)
            print "iPath:    " + input_path
            print "oPath:    " + output_path
            print "Roll:    " + str(roll)
            print "Pitch:   " + str(pitch)
            print "Yaw:     " + str(yaw)
            print "x:       " + str(face_x)
            print "y:       " + str(face_y)
            print "w:       " + str(face_w)
            print "h:       " + str(face_h)
            print ""
            #Increasing the counter
            counter = counter + 1
        #if the file does not exist, raise an exception
        else:
            raise ValueError('Error: I cannot find the file specified: ' + str(input_path))
    #Once finished the iteration it closes the database
    c.close()
if __name__ == "__main__":
    main()