累加数组内起始元素到制定个数元素的累加和: uint32_t vector_add_of_n(uint32_t *ptr, uint32_t nitems) { uint32_t result, *i; uint32x2_t vec64a, vec64b; uint32x4_t vec128 = vdupq_n_u32(0); for (i = ptr; i < (ptr + nitems); i += 4) { uint32x4_t temp128 = vld1q_u32(i); vec128 = vaddq_u32(vec128, temp128); } vec64a = vget_low_u32(vec128); vec64b = vget_high_u32(vec128); vec64a = vadd_u32 (vec64a, vec64b); result = vget_lane_u32(vec64a, 0); result += vget_lane_u32(vec64a, 1); return result; } 两个数组相关映射元素的乘积和: void fir(short * y,const short *x, const short *h,int n_out, int n_coefs) { int n, k; int sum; int16x4_t h_vec; int16x4_t x_vec; int32x4_t result_vec; for (n = 0; n < n_out; n++) { sum = 0; result_vec = vdupq_n_s32(0); for(k = 0; k < n_coefs / 4; k++) { h_vec = vld1_s16(&h[k*4]); x_vec = vld1_s16(&x[n - n_coefs + 1 + k*4]); result_vec = vmlal_s16(result_vec, h_vec, x_vec); } sum += vgetq_lane_s32(result_vec, 0); sum += vgetq_lane_s32(result_vec, 1); sum += vgetq_lane_s32(result_vec, 2); sum += vgetq_lane_s32(result_vec, 3); if(n_coefs % 4) { for(k = n_coefs - (n_coefs % 4); k < n_coefs; k++) sum += h[k] * x[n - n_coefs + 1 + k]; } y[n] = ((sum>>15) + 1) >> 1; } } 彩色转灰度:(C) void reference_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n) { int i; for (i=0; i>8); } } void neon_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n) { int i; uint8x8_t rfac = vdup_n_u8 (77); uint8x8_t gfac = vdup_n_u8 (151); uint8x8_t bfac = vdup_n_u8 (28); n/=8; for (i=0; i