#ifndef CAFFE2_UTILS_CPU_NEON_H_
#define CAFFE2_UTILS_CPU_NEON_H_
 
// Provides a variety of ARM NEON-specific utility functions
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>

#include <cstddef> // size_t (isPointerAligned)
#include <cstdint> // uintptr_t (isPointerAligned)
 
namespace caffe2 {
 
template <typename T>
inline bool isPointerAligned(T* p, size_t align) {
  return (reinterpret_cast<uintptr_t>(p) % align == 0);
}
 
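// Lane-wise ("vertical") sum of four float32x4_t vectors:
// out[i] = v0[i] + v1[i] + v2[i] + v3[i].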
inline float32x4_t vert_sum_f32(float32x4_t v0,
                                float32x4_t v1,
                                float32x4_t v2,
                                float32x4_t v3) {
  v0 = vaddq_f32(v0, v1);
  v2 = vaddq_f32(v2, v3);
  return vaddq_f32(v0, v2);
}
 
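// Reduces four float32x4_t vectors to a single scalar: the sum of all 16 lanes.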
inline float horizontal_sum_f32(float32x4_t v0,
                                float32x4_t v1,
                                float32x4_t v2,
                                float32x4_t v3) {
  v0 = vert_sum_f32(v0, v1, v2, v3);
  float32x2_t v = vadd_f32(vget_high_f32(v0), vget_low_f32(v0));
  return vget_lane_f32(vpadd_f32(v, v), 0);
}
 
// Load/store functions that assume alignment
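// __builtin_assume_aligned only promises the alignment to the compiler;
// callers must guarantee that the pointer actually has that alignment
// (e.g. by checking with isPointerAligned above).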
 
inline float32x4_t vld1q_f32_aligned(const float* p) {
  return vld1q_f32((const float*)
                   __builtin_assume_aligned(p, sizeof(float32x4_t)));
}
 
inline void vst1q_f32_aligned(float* p, float32x4_t v) {
  vst1q_f32((float*) __builtin_assume_aligned(p, sizeof(float32x4_t)), v);
}
 
inline void vst4_u8_aligned(uint8_t* p, uint8x8x4_t v) {
  vst4_u8((uint8_t*)
          __builtin_assume_aligned(p, sizeof(uint8x8x4_t)), v);
}
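
// Illustrative sketch (not part of the original header): one way these
// helpers might be combined to sum a float buffer. The function name is
// hypothetical; it assumes p is 16-byte aligned and n is a multiple of 16,
// and omits tail handling for other lengths.
inline float sum_f32_aligned_example(const float* p, size_t n) {
  float32x4_t acc0 = vdupq_n_f32(0.0f);
  float32x4_t acc1 = vdupq_n_f32(0.0f);
  float32x4_t acc2 = vdupq_n_f32(0.0f);
  float32x4_t acc3 = vdupq_n_f32(0.0f);
  for (size_t i = 0; i < n; i += 16) {
    // Four independent accumulators keep the NEON pipeline busy.
    acc0 = vaddq_f32(acc0, vld1q_f32_aligned(p + i));
    acc1 = vaddq_f32(acc1, vld1q_f32_aligned(p + i + 4));
    acc2 = vaddq_f32(acc2, vld1q_f32_aligned(p + i + 8));
    acc3 = vaddq_f32(acc3, vld1q_f32_aligned(p + i + 12));
  }
  // Collapse the four accumulators into a single scalar sum.
  return horizontal_sum_f32(acc0, acc1, acc2, acc3);
}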
 
}  // namespace caffe2
 
#endif //  defined(__ARM_NEON__) || defined(__ARM_NEON)
 
#endif  // CAFFE2_UTILS_CPU_NEON_H_