From 168af40fe9a3cc81c6ee16b3e81f154780c36bdb Mon Sep 17 00:00:00 2001
From: Scheaven <xuepengqiang>
Date: Thu, 03 Jun 2021 15:03:27 +0800
Subject: [PATCH] up new v4

---
 lib/detecter_tools/darknet/softmax_layer.c | 747 +++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 622 insertions(+), 125 deletions(-)

diff --git a/lib/detecter_tools/darknet/softmax_layer.c b/lib/detecter_tools/darknet/softmax_layer.c
index 61448aa..59f5111 100644
--- a/lib/detecter_tools/darknet/softmax_layer.c
+++ b/lib/detecter_tools/darknet/softmax_layer.c
@@ -1,125 +1,622 @@
-#include "softmax_layer.h"
-#include "blas.h"
-#include "dark_cuda.h"
-#include "utils.h"
-#include "blas.h"
-
-#include <float.h>
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <assert.h>
-
-#define SECRET_NUM -1234
-
-void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output)
-{
-    int b;
-    for (b = 0; b < batch; ++b) {
-        int i;
-        int count = 0;
-        for (i = 0; i < hierarchy->groups; ++i) {
-            int group_size = hierarchy->group_size[i];
-            softmax(input + b*inputs + count, group_size, temp, output + b*inputs + count, 1);
-            count += group_size;
-        }
-    }
-}
-
-softmax_layer make_softmax_layer(int batch, int inputs, int groups)
-{
-    assert(inputs%groups == 0);
-    fprintf(stderr, "softmax %4d\n", inputs);
-    softmax_layer l = { (LAYER_TYPE)0 };
-    l.type = SOFTMAX;
-    l.batch = batch;
-    l.groups = groups;
-    l.inputs = inputs;
-    l.outputs = inputs;
-    l.loss = (float*)xcalloc(inputs * batch, sizeof(float));
-    l.output = (float*)xcalloc(inputs * batch, sizeof(float));
-    l.delta = (float*)xcalloc(inputs * batch, sizeof(float));
-    l.cost = (float*)xcalloc(1, sizeof(float));
-
-    l.forward = forward_softmax_layer;
-    l.backward = backward_softmax_layer;
-    #ifdef GPU
-    l.forward_gpu = forward_softmax_layer_gpu;
-    l.backward_gpu = backward_softmax_layer_gpu;
-
-    l.output_gpu = cuda_make_array(l.output, inputs*batch);
-    l.loss_gpu = cuda_make_array(l.loss, inputs*batch);
-    l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
-    #endif
-    return l;
-}
-
-void forward_softmax_layer(const softmax_layer l, network_state net)
-{
-    if(l.softmax_tree){
-        int i;
-        int count = 0;
-        for (i = 0; i < l.softmax_tree->groups; ++i) {
-            int group_size = l.softmax_tree->group_size[i];
-            softmax_cpu(net.input + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output + count);
-            count += group_size;
-        }
-    } else {
-        softmax_cpu(net.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output);
-    }
-
-    if(net.truth && !l.noloss){
-        softmax_x_ent_cpu(l.batch*l.inputs, l.output, net.truth, l.delta, l.loss);
-        l.cost[0] = sum_array(l.loss, l.batch*l.inputs);
-    }
-}
-
-void backward_softmax_layer(const softmax_layer l, network_state net)
-{
-    axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1);
-}
-
-#ifdef GPU
-
-void pull_softmax_layer_output(const softmax_layer layer)
-{
-    cuda_pull_array(layer.output_gpu, layer.output, layer.inputs*layer.batch);
-}
-
-void forward_softmax_layer_gpu(const softmax_layer l, network_state net)
-{
-    if(l.softmax_tree){
-        softmax_tree_gpu(net.input, 1, l.batch, l.inputs, l.temperature, l.output_gpu, *l.softmax_tree);
-        /*
-        int i;
-        int count = 0;
-        for (i = 0; i < l.softmax_tree->groups; ++i) {
-            int group_size = l.softmax_tree->group_size[i];
-            softmax_gpu(net.input_gpu + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output_gpu + count);
-            count += group_size;
-        }
-        */
-    } else {
-        if(l.spatial){
-            softmax_gpu_new_api(net.input, l.c, l.batch*l.c, l.inputs/l.c, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu);
-        }else{
-            softmax_gpu_new_api(net.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output_gpu);
-        }
-    }
-    if(net.truth && !l.noloss){
-        softmax_x_ent_gpu(l.batch*l.inputs, l.output_gpu, net.truth, l.delta_gpu, l.loss_gpu);
-        if(l.softmax_tree){
-            mask_gpu_new_api(l.batch*l.inputs, l.delta_gpu, SECRET_NUM, net.truth, 0);
-            mask_gpu_new_api(l.batch*l.inputs, l.loss_gpu, SECRET_NUM, net.truth, 0);
-        }
-        cuda_pull_array(l.loss_gpu, l.loss, l.batch*l.inputs);
-        l.cost[0] = sum_array(l.loss, l.batch*l.inputs);
-    }
-}
-
-void backward_softmax_layer_gpu(const softmax_layer layer, network_state net)
-{
-    axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, net.delta, 1);
-}
-
-#endif
+#include "softmax_layer.h"
+#include "blas.h"
+#include "dark_cuda.h"
+#include "utils.h"
+#include "blas.h"
+
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#define SECRET_NUM -1234
+
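+// softmax_tree() applies a temperature-scaled softmax independently to each
+// group of sibling classes in the hierarchy (a sketch of the math):
+//     y_i = exp(x_i / T) / sum_j exp(x_j / T),   T = temp
+// T > 1 flattens the distribution, T < 1 sharpens it; each group of
+// hierarchy->group_size[i] outputs sums to 1.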
+void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output)
+{
+    int b;
+    for (b = 0; b < batch; ++b) {
+        int i;
+        int count = 0;
+        for (i = 0; i < hierarchy->groups; ++i) {
+            int group_size = hierarchy->group_size[i];
+            softmax(input + b*inputs + count, group_size, temp, output + b*inputs + count, 1);
+            count += group_size;
+        }
+    }
+}
+
+softmax_layer make_softmax_layer(int batch, int inputs, int groups)
+{
+    assert(inputs%groups == 0);
+    fprintf(stderr, "softmax %4d\n", inputs);
+    softmax_layer l = { (LAYER_TYPE)0 };
+    l.type = SOFTMAX;
+    l.batch = batch;
+    l.groups = groups;
+    l.inputs = inputs;
+    l.outputs = inputs;
+    l.loss = (float*)xcalloc(inputs * batch, sizeof(float));
+    l.output = (float*)xcalloc(inputs * batch, sizeof(float));
+    l.delta = (float*)xcalloc(inputs * batch, sizeof(float));
+    l.cost = (float*)xcalloc(1, sizeof(float));
+
+    l.forward = forward_softmax_layer;
+    l.backward = backward_softmax_layer;
+#ifdef GPU
+    l.forward_gpu = forward_softmax_layer_gpu;
+    l.backward_gpu = backward_softmax_layer_gpu;
+
+    l.output_gpu = cuda_make_array(l.output, inputs*batch);
+    l.loss_gpu = cuda_make_array(l.loss, inputs*batch);
+    l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
+#endif
+    return l;
+}
+
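+// Forward pass: a grouped softmax over the inputs; when ground truth is
+// present, softmax_x_ent_cpu() (see blas.c) also fills, per element,
+//     loss  = t ? -log(p) : 0
+//     delta = t - p
+// so l.cost accumulates the softmax cross-entropy over the whole batch.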
+void forward_softmax_layer(const softmax_layer l, network_state net)
+{
+    if(l.softmax_tree){
+        int i;
+        int count = 0;
+        for (i = 0; i < l.softmax_tree->groups; ++i) {
+            int group_size = l.softmax_tree->group_size[i];
+            softmax_cpu(net.input + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output + count);
+            count += group_size;
+        }
+    } else {
+        softmax_cpu(net.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output);
+    }
+
+    if(net.truth && !l.noloss){
+        softmax_x_ent_cpu(l.batch*l.inputs, l.output, net.truth, l.delta, l.loss);
+        l.cost[0] = sum_array(l.loss, l.batch*l.inputs);
+    }
+}
+
+void backward_softmax_layer(const softmax_layer l, network_state net)
+{
+    axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1);
+}
+
+#ifdef GPU
+
+void pull_softmax_layer_output(const softmax_layer layer)
+{
+    cuda_pull_array(layer.output_gpu, layer.output, layer.inputs*layer.batch);
+}
+
+void forward_softmax_layer_gpu(const softmax_layer l, network_state net)
+{
+    if(l.softmax_tree){
+        softmax_tree_gpu(net.input, 1, l.batch, l.inputs, l.temperature, l.output_gpu, *l.softmax_tree);
+        /*
+        int i;
+        int count = 0;
+        for (i = 0; i < l.softmax_tree->groups; ++i) {
+            int group_size = l.softmax_tree->group_size[i];
+            softmax_gpu(net.input_gpu + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output_gpu + count);
+            count += group_size;
+        }
+        */
+    } else {
+        if(l.spatial){
+            softmax_gpu_new_api(net.input, l.c, l.batch*l.c, l.inputs/l.c, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu);
+        }else{
+            softmax_gpu_new_api(net.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output_gpu);
+        }
+    }
+    if(net.truth && !l.noloss){
+        softmax_x_ent_gpu(l.batch*l.inputs, l.output_gpu, net.truth, l.delta_gpu, l.loss_gpu);
+        if(l.softmax_tree){
+            mask_gpu_new_api(l.batch*l.inputs, l.delta_gpu, SECRET_NUM, net.truth, 0);
+            mask_gpu_new_api(l.batch*l.inputs, l.loss_gpu, SECRET_NUM, net.truth, 0);
+        }
+        cuda_pull_array(l.loss_gpu, l.loss, l.batch*l.inputs);
+        l.cost[0] = sum_array(l.loss, l.batch*l.inputs);
+    }
+}
+
+void backward_softmax_layer_gpu(const softmax_layer layer, network_state state)
+{
+    axpy_ongpu(layer.batch*layer.inputs, state.net.loss_scale, layer.delta_gpu, 1, state.delta, 1);
+}
+
+#endif
+
+// -------------------------------------
+
+// Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf
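+//
+// A sketch of the loss this layer implements, in the paper's notation: with
+// embeddings z_i and temperature tau, each ordered pair (i, j) is scored as
+//     P(i, j) = exp(sim(z_i, z_j) / tau) / sum_{k != i} exp(sim(z_i, z_k) / tau)
+// and anchor i contributes -log P(i, j) for every positive j (same label).
+// Positives are pulled together in embedding space, negatives pushed apart.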
+contrastive_layer make_contrastive_layer(int batch, int w, int h, int c, int classes, int inputs, layer *yolo_layer)
+{
+    contrastive_layer l = { (LAYER_TYPE)0 };
+    l.type = CONTRASTIVE;
+    l.batch = batch;
+    l.inputs = inputs;
+    l.w = w;
+    l.h = h;
+    l.c = c;
+    l.temperature = 1;
+
+    l.max_boxes = 0;
+    if (yolo_layer) {
+        l.detection = 1;
+        l.max_boxes = yolo_layer->max_boxes;
+        l.labels = yolo_layer->labels;       // track id
+        l.class_ids = yolo_layer->class_ids; // class_ids
+        l.n = yolo_layer->n;                 // num of embeddings per cell = num of anchors
+        l.classes = yolo_layer->classes;     // num of classes
+        classes = l.classes;
+        l.embedding_size = l.inputs / (l.n*l.h*l.w);
+        l.truths = yolo_layer->truths;
+        if (l.embedding_size != yolo_layer->embedding_size) {
+            printf(" Error: [contrastive] embedding_size=%d isn't equal to [yolo] embedding_size=%d. They should use the same [convolutional] layer \n", l.embedding_size, yolo_layer->embedding_size);
+            getchar();
+            exit(0);
+        }
+        if (l.inputs % (l.n*l.h*l.w) != 0) {
+            printf(" Warning: filters= number in the previous (embedding) layer isn't divisible by number of anchors %d \n", l.n);
+            getchar();
+        }
+    }
+    else {
+        l.detection = 0;
+        l.labels = (int*)xcalloc(l.batch, sizeof(int)); // labels
+        l.n = 1;              // num of embeddings per cell
+        l.classes = classes;  // num of classes
+        l.embedding_size = l.c;
+    }
+    l.outputs = inputs;
+
+    l.loss = (float*)xcalloc(1, sizeof(float));
+    l.output = (float*)xcalloc(inputs * batch, sizeof(float));
+    l.delta = (float*)xcalloc(inputs * batch, sizeof(float));
+    l.cost = (float*)xcalloc(1, sizeof(float));
+
+    const size_t step = l.batch*l.n*l.h*l.w;
+    l.cos_sim = NULL;
+    l.exp_cos_sim = NULL;
+    l.p_constrastive = NULL;
+    if (!l.detection) {
+        l.cos_sim = (float*)xcalloc(step*step, sizeof(float));
+        l.exp_cos_sim = (float*)xcalloc(step*step, sizeof(float));
+        l.p_constrastive = (float*)xcalloc(step*step, sizeof(float));
+    }
+    //l.p_constrastive = (float*)xcalloc(step*step, sizeof(float));
+    //l.contrast_p_size = (int*)xcalloc(1, sizeof(int));
+    //*l.contrast_p_size = step;
+    //l.contrast_p = (contrastive_params*)xcalloc(*l.contrast_p_size, sizeof(contrastive_params));
+
+    l.forward = forward_contrastive_layer;
+    l.backward = backward_contrastive_layer;
+#ifdef GPU
+    l.forward_gpu = forward_contrastive_layer_gpu;
+    l.backward_gpu = backward_contrastive_layer_gpu;
+
+    l.output_gpu = cuda_make_array(l.output, inputs*batch);
+    l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
+
+    const int max_contr_size = (l.max_boxes*l.batch)*(l.max_boxes*l.batch) * sizeof(contrastive_params)/4;
+    printf(" max_contr_size = %d MB \n", max_contr_size / (1024*1024));
+    l.contrast_p_gpu = (contrastive_params *)cuda_make_array(NULL, max_contr_size);
+#endif
+    fprintf(stderr, "contrastive %4d x%4d x%4d x emb_size %4d x batch: %4d classes = %4d, step = %4zu \n", w, h, l.n, l.embedding_size, batch, l.classes, step);
+    if(l.detection) fprintf(stderr, "detection \n");
+    return l;
+}
+
+static inline float clip_value(float val, const float max_val)
+{
+    if (val > max_val) {
+        //printf("\n val = %f > max_val = %f \n", val, max_val);
+        val = max_val;
+    }
+    else if (val < -max_val) {
+        //printf("\n val = %f < -max_val = %f \n", val, -max_val);
+        val = -max_val;
+    }
+    return val;
+}
+
+void forward_contrastive_layer(contrastive_layer l, network_state state)
+{
+    if (!state.train) return;
+    const float truth_thresh = state.net.label_smooth_eps;
+
+    const int mini_batch = l.batch / l.steps;
+
+    int b, n, w, h;
+    fill_cpu(l.batch*l.inputs, 0, l.delta, 1);
+
+    if (!l.detection) {
+
+        for (b = 0; b < l.batch; ++b) {
+            if (state.net.adversarial) l.labels[b] = b % 2;
+            else l.labels[b] = b / 2;
+        }
+
+        // set labels
+        for (b = 0; b < l.batch; ++b) {
+            for (h = 0; h < l.h; ++h) {
+                for (w = 0; w < l.w; ++w)
+                {
+                    // find truth with max prob (only 1 label even if mosaic is used)
+                    float max_truth = 0;
+                    int n;
+                    for (n = 0; n < l.classes; ++n) {
+                        const float truth_prob = state.truth[b*l.classes + n];
+                        //printf(" truth_prob = %f, ", truth_prob);
+                        //if (truth_prob > max_truth)
+                        if (truth_prob > truth_thresh)
+                        {
+                            //printf(" truth_prob = %f, max_truth = %f, n = %d; ", truth_prob, max_truth, n);
+                            max_truth = truth_prob;
+                            l.labels[b] = n;
+                        }
+                    }
+                    //printf(", l.labels[b] = %d ", l.labels[b]);
+                }
+            }
+        }
+
+    }
+    //printf("\n\n");
+
+    // gather embedding vectors (copied out of the strided input)
+    float **z = (float**)xcalloc(l.batch*l.n*l.h*l.w, sizeof(float*));
+
+    for (b = 0; b < l.batch; ++b) {
+        for (n = 0; n < l.n; ++n) {
+            for (h = 0; h < l.h; ++h) {
+                for (w = 0; w < l.w; ++w)
+                {
+                    const int z_index = b*l.n*l.h*l.w + n*l.h*l.w + h*l.w + w;
+                    if (l.labels[z_index] < 0) continue;
+
+                    //const int input_index = b*l.inputs + n*l.embedding_size*l.h*l.w + h*l.w + w;
+                    //float *ptr = state.input + input_index;
+                    //z[z_index] = ptr;
+
+                    z[z_index] = (float*)xcalloc(l.embedding_size, sizeof(float));
+                    get_embedding(state.input, l.w, l.h, l.c, l.embedding_size, w, h, n, b, z[z_index]);
+                }
+            }
+        }
+    }
+
+    int b2, n2, h2, w2;
+    int contrast_p_index = 0;
+
+    const size_t step = l.batch*l.n*l.h*l.w;
+    size_t contrast_p_size = step;
+    if (!l.detection) contrast_p_size = l.batch*l.batch;
+    contrastive_params *contrast_p = (contrastive_params*)xcalloc(contrast_p_size, sizeof(contrastive_params));
+
+    float *max_sim_same = (float *)xcalloc(l.batch*l.inputs, sizeof(float));
+    float *max_sim_diff = (float *)xcalloc(l.batch*l.inputs, sizeof(float));
+    fill_cpu(l.batch*l.inputs, -10, max_sim_same, 1);
+    fill_cpu(l.batch*l.inputs, -10, max_sim_diff, 1);
+
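+    // The pre-pass below scores every pair of labeled embeddings (i, j):
+    //     sim(i, j)     = z_i . z_j / (||z_i|| * ||z_j||)    (cosine similarity)
+    //     exp_sim(i, j) = exp(sim(i, j) / l.temperature)
+    // skipping self-pairs, pairs from different time steps of the mini-batch,
+    // and (in detection mode) pairs whose class ids differ.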
+    // precalculate cosine similarity
+    for (b = 0; b < l.batch; ++b) {
+        for (n = 0; n < l.n; ++n) {
+            for (h = 0; h < l.h; ++h) {
+                for (w = 0; w < l.w; ++w)
+                {
+                    const int z_index = b*l.n*l.h*l.w + n*l.h*l.w + h*l.w + w;
+                    if (l.labels[z_index] < 0) continue;
+
+                    for (b2 = 0; b2 < l.batch; ++b2) {
+                        for (n2 = 0; n2 < l.n; ++n2) {
+                            for (h2 = 0; h2 < l.h; ++h2) {
+                                for (w2 = 0; w2 < l.w; ++w2)
+                                {
+                                    const int z_index2 = b2*l.n*l.h*l.w + n2*l.h*l.w + h2*l.w + w2;
+                                    if (l.labels[z_index2] < 0) continue;
+                                    if (z_index == z_index2) continue;
+                                    if (l.detection)
+                                        if (l.class_ids[z_index] != l.class_ids[z_index2]) continue;
+
+                                    const int time_step_i = b / mini_batch;
+                                    const int time_step_j = b2 / mini_batch;
+                                    if (time_step_i != time_step_j) continue;
+
+                                    const size_t step = l.batch*l.n*l.h*l.w;
+
+                                    const float sim = cosine_similarity(z[z_index], z[z_index2], l.embedding_size);
+                                    const float exp_sim = expf(sim / l.temperature);
+                                    if (!l.detection) {
+                                        l.cos_sim[z_index*step + z_index2] = sim;
+                                        l.exp_cos_sim[z_index*step + z_index2] = exp_sim;
+                                    }
+
+                                    // calc good sim
+                                    if (l.labels[z_index] == l.labels[z_index2] && max_sim_same[z_index] < sim) max_sim_same[z_index] = sim;
+                                    if (l.labels[z_index] != l.labels[z_index2] && max_sim_diff[z_index] < sim) max_sim_diff[z_index] = sim;
+                                    //printf(" z_i = %d, z_i2 = %d, l = %d, l2 = %d, sim = %f \n", z_index, z_index2, l.labels[z_index], l.labels[z_index2], sim);
+
+                                    contrast_p[contrast_p_index].sim = sim;
+                                    contrast_p[contrast_p_index].exp_sim = exp_sim;
+                                    contrast_p[contrast_p_index].i = z_index;
+                                    contrast_p[contrast_p_index].j = z_index2;
+                                    contrast_p[contrast_p_index].time_step_i = time_step_i;
+                                    contrast_p[contrast_p_index].time_step_j = time_step_j;
+                                    contrast_p_index++;
+                                    //printf(" contrast_p_index = %d, contrast_p_size = %d \n", contrast_p_index, contrast_p_size);
+                                    if ((contrast_p_index+1) >= contrast_p_size) {
+                                        contrast_p_size = contrast_p_index + 1;
+                                        //printf(" contrast_p_size = %d, z_index = %d, z_index2 = %d \n", contrast_p_size, z_index, z_index2);
+                                        contrast_p = (contrastive_params*)xrealloc(contrast_p, contrast_p_size * sizeof(contrastive_params));
+                                    }
+
+                                    if (sim > 1.001 || sim < -1.001) {
+                                        printf(" sim = %f, ", sim); getchar();
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
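+    // Accuracy heuristic: an embedding counts as "good" when its highest
+    // same-label similarity beats its highest different-label similarity;
+    // *l.loss is repurposed here to report that ratio as a percentage over
+    // all embeddings that took part in at least one pair.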
+    // calc contrastive accuracy
+    int i;
+    int good_sims = 0, all_sims = 0, same_sim = 0, diff_sim = 0;
+    for (i = 0; i < l.batch*l.inputs; ++i) {
+        if (max_sim_same[i] >= -1 && max_sim_diff[i] >= -1) {
+            if (max_sim_same[i] >= -1) same_sim++;
+            if (max_sim_diff[i] >= -1) diff_sim++;
+            ++all_sims;
+            //printf(" max_sim_diff[i] = %f, max_sim_same[i] = %f \n", max_sim_diff[i], max_sim_same[i]);
+            if (max_sim_diff[i] < max_sim_same[i]) good_sims++;
+        }
+    }
+    if (all_sims > 0) {
+        *l.loss = 100 * good_sims / all_sims;
+    }
+    else *l.loss = -1;
+    printf(" Contrast accuracy = %f %%, all = %d, good = %d, same = %d, diff = %d \n", *l.loss, all_sims, good_sims, same_sim, diff_sim);
+    free(max_sim_same);
+    free(max_sim_diff);
+
+
+    /*
+    // show near sim
+    float good_contrast = 0;
+    for (b = 0; b < l.batch; b += 2) {
+        float same = l.cos_sim[b*l.batch + b];
+        float aug = l.cos_sim[b*l.batch + b + 1];
+        float diff = l.cos_sim[b*l.batch + b + 2];
+        good_contrast += (aug > diff);
+        //printf(" l.labels[b] = %d, l.labels[b+1] = %d, l.labels[b+2] = %d, b = %d \n", l.labels[b], l.labels[b + 1], l.labels[b + 2], b);
+        //printf(" same = %f, aug = %f, diff = %f, (aug > diff) = %d \n", same, aug, diff, (aug > diff));
+    }
+    *l.loss = 100 * good_contrast / (l.batch / 2);
+    printf(" Contrast accuracy = %f %% \n", *l.loss);
+    */
+
+    /*
+    // precalculate P_contrastive
+    for (b = 0; b < l.batch; ++b) {
+        int b2;
+        for (b2 = 0; b2 < l.batch; ++b2) {
+            if (b != b2) {
+                const float P = P_constrastive(b, b2, l.labels, l.batch, z, l.embedding_size, l.temperature, l.cos_sim);
+                l.p_constrastive[b*l.batch + b2] = P;
+                if (P > 1 || P < -1) {
+                    printf(" p = %f, ", P); getchar();
+                }
+            }
+        }
+    }
+    */
+
+
+    const size_t contr_size = contrast_p_index;
+
+    if (l.detection) {
+#ifdef GPU
+        const int max_contr_size = (l.max_boxes*l.batch)*(l.max_boxes*l.batch);
+        if (max_contr_size < contr_size) {
+            printf(" Error: too large number of bboxes: contr_size = %zu > max_contr_size = %d \n", contr_size, max_contr_size);
+            exit(0);
+        }
+        int *labels = NULL;
+        if (contr_size > 2) {
+            cuda_push_array((float *)l.contrast_p_gpu, (float *)contrast_p, contr_size * sizeof(contrastive_params) / 4);
+            P_constrastive_f_det_gpu(labels, l.embedding_size, l.temperature, l.contrast_p_gpu, contr_size);
+            cuda_pull_array((float *)l.contrast_p_gpu, (float *)contrast_p, contr_size * sizeof(contrastive_params) / 4);
+        }
+#else // GPU
+        int k;
+        //#pragma omp parallel for
+        for (k = 0; k < contr_size; ++k) {
+            contrast_p[k].P = P_constrastive_f_det(k, l.labels, z, l.embedding_size, l.temperature, contrast_p, contr_size);
+        }
+#endif // GPU
+    }
+    else {
+        // precalculate P-contrastive
+        for (b = 0; b < l.batch; ++b) {
+            for (n = 0; n < l.n; ++n) {
+                for (h = 0; h < l.h; ++h) {
+                    for (w = 0; w < l.w; ++w)
+                    {
+                        const int z_index = b*l.n*l.h*l.w + n*l.h*l.w + h*l.w + w;
+                        if (l.labels[z_index] < 0) continue;
+
+                        for (b2 = 0; b2 < l.batch; ++b2) {
+                            for (n2 = 0; n2 < l.n; ++n2) {
+                                for (h2 = 0; h2 < l.h; ++h2) {
+                                    for (w2 = 0; w2 < l.w; ++w2)
+                                    {
+                                        const int z_index2 = b2*l.n*l.h*l.w + n2*l.h*l.w + h2*l.w + w2;
+                                        if (l.labels[z_index2] < 0) continue;
+                                        if (z_index == z_index2) continue;
+                                        if (l.detection)
+                                            if (l.class_ids[z_index] != l.class_ids[z_index2]) continue;
+
+                                        const int time_step_i = b / mini_batch;
+                                        const int time_step_j = b2 / mini_batch;
+                                        if (time_step_i != time_step_j) continue;
+
+                                        const size_t step = l.batch*l.n*l.h*l.w;
+
+                                        float P = -10;
+                                        if (l.detection) {
+                                            P = P_constrastive_f(z_index, z_index2, l.labels, z, l.embedding_size, l.temperature, contrast_p, contr_size);
+                                        }
+                                        else {
+                                            P = P_constrastive(z_index, z_index2, l.labels, step, z, l.embedding_size, l.temperature, l.cos_sim, l.exp_cos_sim);
+                                            l.p_constrastive[z_index*step + z_index2] = P;
+                                        }
+
+                                        int q;
+                                        for (q = 0; q < contr_size; ++q)
+                                            if (contrast_p[q].i == z_index && contrast_p[q].j == z_index2) {
+                                                contrast_p[q].P = P;
+                                                break;
+                                            }
+
+                                        //if (q == contr_size) getchar();
+
+
+                                        //if (P > 1 || P < -1) {
+                                        //    printf(" p = %f, z_index = %d, z_index2 = %d ", P, z_index, z_index2); getchar();
+                                        //}
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
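+    // Delta computation (a sketch of the gradient): anchor i minimizes
+    //     L_i = -sum_{j in positives(i)} log P(i, j),
+    // so the grad_contrastive_loss_positive* calls accumulate the attractive
+    // term toward same-label embeddings and grad_contrastive_loss_negative*
+    // the repulsive term away from different-label ones, both written into
+    // l.delta at each embedding's strided offset.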
+    // calc deltas
+    #pragma omp parallel for
+    for (b = 0; b < l.batch; ++b) {
+        for (n = 0; n < l.n; ++n) {
+            for (h = 0; h < l.h; ++h) {
+                for (w = 0; w < l.w; ++w)
+                {
+                    const int z_index = b*l.n*l.h*l.w + n*l.h*l.w + h*l.w + w;
+                    const size_t step = l.batch*l.n*l.h*l.w;
+                    if (l.labels[z_index] < 0) continue;
+
+                    const int delta_index = b*l.embedding_size*l.n*l.h*l.w + n*l.embedding_size*l.h*l.w + h*l.w + w;
+                    const int wh = l.w*l.h;
+
+                    if (l.detection) {
+                        // detector
+
+                        // positive
+                        grad_contrastive_loss_positive_f(z_index, l.class_ids, l.labels, step, z, l.embedding_size, l.temperature, l.delta + delta_index, wh, contrast_p, contr_size);
+
+                        // negative
+                        grad_contrastive_loss_negative_f(z_index, l.class_ids, l.labels, step, z, l.embedding_size, l.temperature, l.delta + delta_index, wh, contrast_p, contr_size, l.contrastive_neg_max);
+                    }
+                    else {
+                        // classifier
+
+                        // positive
+                        grad_contrastive_loss_positive(z_index, l.labels, step, z, l.embedding_size, l.temperature, l.cos_sim, l.p_constrastive, l.delta + delta_index, wh);
+
+                        // negative
+                        grad_contrastive_loss_negative(z_index, l.labels, step, z, l.embedding_size, l.temperature, l.cos_sim, l.p_constrastive, l.delta + delta_index, wh);
+                    }
+
+                }
+            }
+        }
+    }
+
+    scal_cpu(l.inputs * l.batch, l.cls_normalizer, l.delta, 1);
+
+    for (i = 0; i < l.inputs * l.batch; ++i) {
+        l.delta[i] = clip_value(l.delta[i], l.max_delta);
+    }
+
+    *(l.cost) = pow(mag_array(l.delta, l.inputs * l.batch), 2);
+    if (state.net.adversarial) {
+        printf(" adversarial contrastive loss = %f \n\n", *(l.cost));
+    }
+    else {
+        printf(" contrastive loss = %f \n\n", *(l.cost));
+    }
+
+    for (b = 0; b < l.batch; ++b) {
+        for (n = 0; n < l.n; ++n) {
+            for (h = 0; h < l.h; ++h) {
+                for (w = 0; w < l.w; ++w)
+                {
+                    const int z_index = b*l.n*l.h*l.w + n*l.h*l.w + h*l.w + w;
+                    //if (l.labels[z_index] < 0) continue;
+                    if (z[z_index]) free(z[z_index]);
+                }
+            }
+        }
+    }
+
+    free(contrast_p);
+    free(z);
+}
+
+void backward_contrastive_layer(contrastive_layer l, network_state state)
+{
+    axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, state.delta, 1);
+}
+
+
+#ifdef GPU
+
+void pull_contrastive_layer_output(const contrastive_layer l)
+{
+    cuda_pull_array(l.output_gpu, l.output, l.inputs*l.batch);
+}
+
+void push_contrastive_layer_output(const contrastive_layer l)
+{
+    cuda_push_array(l.delta_gpu, l.delta, l.inputs*l.batch);
+}
+
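+// GPU forward: the contrastive loss itself runs on the CPU. The layer copies
+// the input to l.output_gpu, pulls output and truth into host buffers, runs
+// the CPU forward_contrastive_layer() on them, and pushes the resulting
+// deltas back to l.delta_gpu for the backward pass.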
+void forward_contrastive_layer_gpu(contrastive_layer l, network_state state)
+{
+    simple_copy_ongpu(l.batch*l.inputs, state.input, l.output_gpu);
+    if (!state.train) return;
+
+    float *in_cpu = (float *)xcalloc(l.batch*l.inputs, sizeof(float));
+    cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
+    memcpy(in_cpu, l.output, l.batch*l.outputs * sizeof(float));
+    float *truth_cpu = 0;
+    if (state.truth) {
+        int num_truth = l.batch*l.classes;
+        if (l.detection) num_truth = l.batch*l.truths;
+        truth_cpu = (float *)xcalloc(num_truth, sizeof(float));
+        cuda_pull_array(state.truth, truth_cpu, num_truth);
+    }
+    network_state cpu_state = state;
+    cpu_state.net = state.net;
+    cpu_state.index = state.index;
+    cpu_state.train = state.train;
+    cpu_state.truth = truth_cpu;
+    cpu_state.input = in_cpu;
+
+    forward_contrastive_layer(l, cpu_state);
+    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
+
+    free(in_cpu);
+    if (cpu_state.truth) free(cpu_state.truth);
+}
+
+void backward_contrastive_layer_gpu(contrastive_layer layer, network_state state)
+{
+    axpy_ongpu(layer.batch*layer.inputs, state.net.loss_scale, layer.delta_gpu, 1, state.delta, 1);
+}
+
+#endif
\ No newline at end of file
--
Gitblit v1.8.0