123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448 |
- /*
- * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- /* ----------------------------------------------------------------------
- * Project: CMSIS NN Library
- * Title: arm_pool_q7_HWC.c
- * Description: Pooling function implementations
- *
- * $Date: 17. January 2018
- * $Revision: V.1.0.0
- *
- * Target Processor: Cortex-M cores
- *
- * -------------------------------------------------------------------- */
- #include "arm_math.h"
- #include "arm_nnfunctions.h"
- #if defined (ARM_MATH_DSP)
- /**
- * @brief A few utility functions used by pooling functions
- *
- *
- */
- static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale)
- {
- int i;
- for (i = 0; i < length; i++)
- {
- target[i] = (q7_t) (buffer[i] / scale);
- }
- }
- static void compare_and_replace_if_larger_q7(q7_t * base, // base data
- q7_t * target, // compare target
- const uint16_t length // data length
- )
- {
- q7_t *pIn = base;
- q7_t *pCom = target;
- union arm_nnword in;
- union arm_nnword com;
- uint16_t cnt = length >> 2;
- while (cnt > 0u)
- {
- in.word = *__SIMD32(pIn);
- com.word = *__SIMD32(pCom)++;
- // if version
- if (com.bytes[0] > in.bytes[0])
- in.bytes[0] = com.bytes[0];
- if (com.bytes[1] > in.bytes[1])
- in.bytes[1] = com.bytes[1];
- if (com.bytes[2] > in.bytes[2])
- in.bytes[2] = com.bytes[2];
- if (com.bytes[3] > in.bytes[3])
- in.bytes[3] = com.bytes[3];
- *__SIMD32(pIn)++ = in.word;
- cnt--;
- }
- }
- static void accumulate_q7_to_q15(q15_t * base, q7_t * target, const uint16_t length)
- {
- q15_t *pCnt = base;
- q7_t *pV = target;
- q31_t v1, v2, vo1, vo2;
- uint16_t cnt = length >> 2;
- q31_t in;
- while (cnt > 0u)
- {
- q31_t value = *__SIMD32(pV)++;
- v1 = __SXTB16(__ROR(value, 8));
- v2 = __SXTB16(value);
- #ifndef ARM_MATH_BIG_ENDIAN
- vo2 = __PKHTB(v1, v2, 16);
- vo1 = __PKHBT(v2, v1, 16);
- #else
- vo1 = __PKHTB(v1, v2, 16);
- vo2 = __PKHBT(v2, v1, 16);
- #endif
- in = *__SIMD32(pCnt);
- *__SIMD32(pCnt)++ = __QADD16(vo1, in);
- in = *__SIMD32(pCnt);
- *__SIMD32(pCnt)++ = __QADD16(vo2, in);
- cnt--;
- }
- cnt = length & 0x3;
- while (cnt > 0u)
- {
- *pCnt++ += *pV++;
- cnt--;
- }
- }
- #endif // ARM_MATH_DSP
- /**
- * @ingroup groupNN
- */
- /**
- * @addtogroup Pooling
- * @{
- */
- /**
- * @brief Q7 max pooling function
- * @param[in, out] Im_in pointer to input tensor
- * @param[in] dim_im_in input tensor dimention
- * @param[in] ch_im_in number of input tensor channels
- * @param[in] dim_kernel filter kernel size
- * @param[in] padding padding sizes
- * @param[in] stride convolution stride
- * @param[in] dim_im_out output tensor dimension
- * @param[in,out] bufferA pointer to buffer space for input
- * @param[in,out] Im_out pointer to output tensor
- * @return none.
- *
- * @details
- *
- * <b>Buffer size:</b>
- *
- * bufferA size: 0
- *
- * The pooling function is implemented as split x-pooling then
- * y-pooling.
- *
- * This pooling function is input-destructive. Input data is undefined
- * after calling this function.
- *
- */
- void
- arm_maxpool_q7_HWC(q7_t * Im_in,
- const uint16_t dim_im_in,
- const uint16_t ch_im_in,
- const uint16_t dim_kernel,
- const uint16_t padding,
- const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
- {
- #if defined (ARM_MATH_DSP)
- /* Run the following code for Cortex-M4 and Cortex-M7 */
- int16_t i_x, i_y;
- /* first does the pooling along x axis */
- for (i_y = 0; i_y < dim_im_in; i_y++)
- {
- for (i_x = 0; i_x < dim_im_out; i_x++)
- {
- /* for each output pixel */
- q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
- q7_t *win_start;
- q7_t *win_stop;
- if (i_x * stride - padding < 0)
- {
- win_start = target;
- } else
- {
- win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
- }
- if (i_x * stride - padding + dim_kernel >= dim_im_in)
- {
- win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
- } else
- {
- win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
- }
- /* first step is to copy over initial data */
- /* arm_copy_q7(win_start, target, ch_im_in); */
- memmove(target, win_start, ch_im_in);
- /* start the max operation from the second part */
- win_start += ch_im_in;
- for (; win_start < win_stop; win_start += ch_im_in)
- {
- compare_and_replace_if_larger_q7(target, win_start, ch_im_in);
- }
- }
- }
- /* then does the pooling along y axis */
- for (i_y = 0; i_y < dim_im_out; i_y++)
- {
- /* for each output row */
- q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
- q7_t *row_start;
- q7_t *row_end;
- /* setting the starting row */
- if (i_y * stride - padding < 0)
- {
- row_start = Im_in;
- } else
- {
- row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
- }
- /* setting the stopping row */
- if (i_y * stride - padding + dim_kernel >= dim_im_in)
- {
- row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
- } else
- {
- row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
- }
- /* copy over the first row */
- /* arm_copy_q7(row_start, target, dim_im_out * ch_im_in); */
- memmove(target, row_start, dim_im_out * ch_im_in);
- /* move over to next row */
- row_start += ch_im_in * dim_im_in;
- for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
- {
- compare_and_replace_if_larger_q7(target, row_start, dim_im_out * ch_im_in);
- }
- }
- #else
- /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
- int16_t i_ch_in, i_x, i_y;
- int16_t k_x, k_y;
- for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
- {
- for (i_y = 0; i_y < dim_im_out; i_y++)
- {
- for (i_x = 0; i_x < dim_im_out; i_x++)
- {
- int max = -129;
- for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
- {
- for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
- {
- if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
- {
- if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
- {
- max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
- }
- }
- }
- }
- Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
- }
- }
- }
- #endif /* ARM_MATH_DSP */
- }
- /**
- * @brief Q7 average pooling function
- * @param[in,out] Im_in pointer to input tensor
- * @param[in] dim_im_in input tensor dimention
- * @param[in] ch_im_in number of input tensor channels
- * @param[in] dim_kernel filter kernel size
- * @param[in] padding padding sizes
- * @param[in] stride convolution stride
- * @param[in] dim_im_out output tensor dimension
- * @param[in,out] bufferA pointer to buffer space for input
- * @param[in,out] Im_out pointer to output tensor
- * @return none.
- *
- * @details
- *
- * <b>Buffer size:</b>
- *
- * bufferA size: 2*dim_im_out*ch_im_in
- *
- * The pooling function is implemented as split x-pooling then
- * y-pooling.
- *
- * This pooling function is input-destructive. Input data is undefined
- * after calling this function.
- *
- */
- void
- arm_avepool_q7_HWC(q7_t * Im_in,
- const uint16_t dim_im_in,
- const uint16_t ch_im_in,
- const uint16_t dim_kernel,
- const uint16_t padding,
- const uint16_t stride, const uint16_t dim_im_out, q7_t * bufferA, q7_t * Im_out)
- {
- #if defined (ARM_MATH_DSP)
- /* Run the following code for Cortex-M4 and Cortex-M7 */
- q15_t *buffer = (q15_t *) bufferA;
- int16_t i_x, i_y;
- int16_t count = 0;
- /* first does the pooling along x axis */
- for (i_y = 0; i_y < dim_im_in; i_y++)
- {
- for (i_x = 0; i_x < dim_im_out; i_x++)
- {
- /* for each output pixel */
- q7_t *target = Im_in + (i_y * dim_im_in + i_x) * ch_im_in;
- q7_t *win_start;
- q7_t *win_stop;
- if (i_x * stride - padding < 0)
- {
- win_start = target;
- } else
- {
- win_start = Im_in + (i_y * dim_im_in + i_x * stride - padding) * ch_im_in;
- }
- if (i_x * stride - padding + dim_kernel >= dim_im_in)
- {
- win_stop = Im_in + (i_y * dim_im_in + dim_im_in) * ch_im_in;
- } else
- {
- win_stop = Im_in + (i_y * dim_im_in + i_x * stride - padding + dim_kernel) * ch_im_in;
- }
- /* first step is to copy over initial data */
- arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
- count = 1;
- /* start the max operation from the second part */
- win_start += ch_im_in;
- for (; win_start < win_stop; win_start += ch_im_in)
- {
- accumulate_q7_to_q15(buffer, win_start, ch_im_in);
- count++;
- }
- buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
- }
- }
- /* then does the pooling along y axis */
- for (i_y = 0; i_y < dim_im_out; i_y++)
- {
- /* for each output row */
- q7_t *target = Im_out + i_y * dim_im_out * ch_im_in;
- q7_t *row_start;
- q7_t *row_end;
- /* setting the starting row */
- if (i_y * stride - padding < 0)
- {
- row_start = Im_in;
- } else
- {
- row_start = Im_in + (i_y * stride - padding) * dim_im_in * ch_im_in;
- }
- /* setting the stopping row */
- if (i_y * stride - padding + dim_kernel >= dim_im_in)
- {
- row_end = Im_in + dim_im_in * dim_im_in * ch_im_in;
- } else
- {
- row_end = Im_in + (i_y * stride - padding + dim_kernel) * dim_im_in * ch_im_in;
- }
- /* copy over the first row */
- arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out * ch_im_in);
- count = 1;
- /* move over to next row */
- row_start += ch_im_in * dim_im_in;
- for (; row_start < row_end; row_start += dim_im_in * ch_im_in)
- {
- accumulate_q7_to_q15(buffer, row_start, dim_im_out * ch_im_in);
- count++;
- }
- buffer_scale_back_q15_to_q7(buffer, target, dim_im_out * ch_im_in, count);
- }
- #else
- /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
- int16_t i_ch_in, i_x, i_y;
- int16_t k_x, k_y;
- for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
- {
- for (i_y = 0; i_y < dim_im_out; i_y++)
- {
- for (i_x = 0; i_x < dim_im_out; i_x++)
- {
- int sum = 0;
- int count = 0;
- for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
- {
- for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
- {
- if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
- {
- sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
- count++;
- }
- }
- }
- Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = sum / count;
- }
- }
- }
- #endif /* ARM_MATH_DSP */
- }
- /**
- * @} end of Pooling group
- */
|