arm_nnfunctions_modified.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234
  1. /*
  2. * Copyright (C) 2010-2022 Arm Limited or its affiliates.
  3. *
  4. * SPDX-License-Identifier: Apache-2.0
  5. *
  6. * Licensed under the Apache License, Version 2.0 (the License); you may
  7. * not use this file except in compliance with the License.
  8. * You may obtain a copy of the License at
  9. *
  10. * www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  14. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. /* ----------------------------------------------------------------------
  19. * This file is MODIFIED from Arm CMSIS NN Library.
  20. *
  21. * Project: TinyEngine
  22. * Title: arm_nnfunctions_modified.h
  23. * Description: Public header file for TinyEngine.
  24. *
  25. * Reference papers:
  26. * - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020
  27. * - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
  28. * - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
  29. * Contact authors:
  30. * - Wei-Ming Chen, wmchen@mit.edu
  31. * - Wei-Chen Wang, wweichen@mit.edu
  32. * - Ji Lin, jilin@mit.edu
  33. * - Ligeng Zhu, ligeng@mit.edu
  34. * - Song Han, songhan@mit.edu
  35. *
  36. * Original Project: CMSIS NN Library
  37. * Original Title: arm_nnfunctions.h
  38. *
  39. * Target Processor: Cortex-M CPUs
  40. * -------------------------------------------------------------------- */
  41. /**
  42. \mainpage CMSIS NN Software Library
  43. *
  44. * Introduction
  45. * ------------
  46. *
  47. * This user manual describes the CMSIS NN software library,
  48. * a collection of efficient neural network kernels developed to maximize the
  49. * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
  50. *
  51. * The library is divided into a number of functions each covering a specific category:
  52. * - Convolution Functions
  53. * - Activation Functions
  54. * - Fully-connected Layer Functions
  55. * - SVDF Layer Functions
  56. * - Pooling Functions
  57. * - Softmax Functions
  58. * - Basic math Functions
  59. *
  60. * The library has separate functions for operating on different weight and activation data
  61. * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The descriptions of the
  62. * kernels are included in the function description. The implementation details are also
  63. * described in this paper [1].
  64. *
  65. * Function Classification
  66. * --------
  67. * The functions can be classified into two segments
  68. * - Legacy functions supporting ARM's internal symmetric quantization(8 bits).
  69. * - Functions that support TensorFlow Lite framework with symmetric quantization(8 bits).
  70. *
  71. * The legacy functions can be identified with their suffix of _q7 or _q15 and no new development is done there.
  72. * The article in [2] describes in detail how to run a network using the legacy functions.
  73. *
  74. * The functions supporting TensorFlow Lite framework are identified by the _s8 suffix and can be invoked from TFL
  75. * micro. The functions are bit exact to TensorFlow Lite. Refer to the TensorFlow's documentation in [3] on how to run
  76. * a TensorFlow Lite model using optimized CMSIS-NN kernels.
  77. *
  78. * Block Diagram
  79. * --------
  80. * \image html CMSIS-NN-OVERVIEW.PNG
  81. *
  82. * Examples
  83. * --------
  84. *
  85. * The library ships with a number of examples which demonstrate how to use the library functions.
  86. *
  87. * Pre-processor Macros
  88. * ------------
  89. *
  90. * Each library project has different pre-processor macros.
  91. *
  92. * - ARM_MATH_DSP:
  93. *
  94. * Define macro ARM_MATH_DSP if the silicon supports DSP instructions (DSP extension).
  95. *
  96. * - ARM_MATH_MVEI:
  97. *
  98. * Define macro ARM_MATH_MVEI if the silicon supports M-Profile Vector Extension.
  99. * - ARM_MATH_AUTOVECTORIZE
  100. * Used in conjunction with ARM_MATH_MVEI to let the compiler auto vectorize for the functions that use inline
  101. * assembly. It does not affect functions that use C or intrinsics.
  102. * - ARM_MATH_BIG_ENDIAN:
  103. *
  104. * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. This is supported only for the legacy
  105. * functions, i.e., functions targeted at TensorFlow Lite do not support big endianness. By default the library builds for
  106. * little endian targets.
  107. *
  108. * - ARM_NN_TRUNCATE:
  109. *
  110. * Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
  111. *
  112. *
  113. * Copyright Notice
  114. * ------------
  115. *
  116. * Copyright (C) 2010-2019 Arm Limited. All rights reserved.
  117. *
  118. * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
  119. *
  120. * [2] Converting a Neural Network for Arm Cortex-M with CMSIS-NN
  121. *
  122. * https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page
  123. * [3] https://www.tensorflow.org/lite/microcontrollers/library
  124. *
  125. * [4] https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN#legacy-vs-tfl-micro-compliant-apis
  126. */
  127. /**
  128. * @defgroup groupNN Neural Network Functions
  129. * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support
  130. * TensorFlow Lite framework.
  131. */
  /*
   * Include guard for this header; the matching #endif is the last line of
   * the file.
   * NOTE(review): identifiers that start with an underscore followed by an
   * uppercase letter are reserved for the implementation in C. The name is
   * left unchanged here because other files may test this macro - confirm
   * before renaming.
   */
  132. #ifndef _ARM_NNFUNCTIONS_H
  133. #define _ARM_NNFUNCTIONS_H
  /* Project header; presumably supplies the q7_t / q15_t types used by the
   * prototypes below - TODO confirm. */
  134. #include "hpm_math.h"
  /* Defined unconditionally for every includer. The code that tests
   * USE_INTRINSIC is not visible in this header. */
  135. #define USE_INTRINSIC
  /* ARM_NN_TRUNCATE is left disabled (commented out) in this build. */
  136. //#define ARM_NN_TRUNCATE /* This config the rounding model to floor or round to the nearest int */
  /* Give the declarations C linkage when the header is included from C++. */
  137. #ifdef __cplusplus
  138. extern "C" {
  139. #endif
  140. /**
  141. * @defgroup NNConv Convolution Functions
  142. *
  143. * Collection of convolution, depthwise convolution functions and their variants.
  144. *
  145. * The convolution is implemented in 2 steps: im2col and GEMM
  146. *
  147. * im2col is a process of converting each patch of image data into
  148. * a column. After im2col, the convolution is computed as matrix-matrix
  149. * multiplication.
  150. *
  151. * To reduce the memory footprint, the im2col is performed partially.
  152. * Each iteration, only a few columns (i.e., patches) are generated and
  153. * computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
  154. *
  155. */
  /**
   * @brief Prototype of the s8 convolution routine arm_convolve_s8_4col.
   *
   * Only the declaration is visible in this header. The parameter notes below
   * are inferred from names and types and must be confirmed against the
   * implementation before being relied on.
   *
   * @param input              Read-only q7_t input data.
   * @param input_x            Presumably input width - TODO confirm.
   * @param input_y            Presumably input height - TODO confirm.
   * @param input_ch           Presumably number of input channels - TODO confirm.
   * @param input_batches      Presumably number of input batches - TODO confirm.
   * @param kernel             Read-only q7_t kernel (weight) data.
   * @param output_ch          Presumably number of output channels - TODO confirm.
   * @param kernel_x           Presumably kernel width - TODO confirm.
   * @param kernel_y           Presumably kernel height - TODO confirm.
   * @param pad_x              Presumably padding along x - TODO confirm.
   * @param pad_y              Presumably padding along y - TODO confirm.
   * @param stride_x           Presumably stride along x - TODO confirm.
   * @param stride_y           Presumably stride along y - TODO confirm.
   * @param bias               Read-only int32_t bias data.
   * @param output             Writable q7_t output buffer.
   * @param output_shift       Read-only int32_t array; presumably per-output-channel
   *                           requantization shifts - TODO confirm.
   * @param output_mult        Read-only int32_t array; presumably per-output-channel
   *                           requantization multipliers - TODO confirm.
   * @param out_offset         Presumably output zero-point offset - TODO confirm.
   * @param input_offset       Presumably input zero-point offset - TODO confirm.
   * @param out_activation_min Presumably lower clamp bound for outputs - TODO confirm.
   * @param out_activation_max Presumably upper clamp bound for outputs - TODO confirm.
   * @param output_x           Presumably output width - TODO confirm.
   * @param output_y           Presumably output height - TODO confirm.
   * @param buffer_a           Writable q15_t buffer; presumably im2col scratch space,
   *                           required size not visible here - TODO confirm.
   *
   * @return A char; presumably a status code, but the encoding is not visible
   *         in this header - verify against the implementation.
   */
  156. char arm_convolve_s8_4col(const q7_t *input,
  157. const uint16_t input_x,
  158. const uint16_t input_y,
  159. const uint16_t input_ch,
  160. const uint16_t input_batches,
  161. const q7_t *kernel,
  162. const uint16_t output_ch,
  163. const uint16_t kernel_x,
  164. const uint16_t kernel_y,
  165. const uint16_t pad_x,
  166. const uint16_t pad_y,
  167. const uint16_t stride_x,
  168. const uint16_t stride_y,
  169. const int32_t *bias,
  170. q7_t *output,
  171. const int32_t *output_shift,
  172. const int32_t *output_mult,
  173. const int32_t out_offset,
  174. const int32_t input_offset,
  175. const int32_t out_activation_min,
  176. const int32_t out_activation_max,
  177. const uint16_t output_x,
  178. const uint16_t output_y,
  179. q15_t *buffer_a);
  /**
   * @brief Prototype of the matrix-multiplication kernel
   *        hpm_nn_mat_mult_kernel_s8_s16_reordered_oddch.
   *
   * Only the declaration is visible in this header; semantics below are
   * inferred from names and types and need confirmation.
   * NOTE(review): the "oddch" suffix suggests a variant for odd channel
   * counts - confirm against the implementation.
   *
   * @param input_a        Read-only q7_t operand.
   * @param input_b        Read-only q15_t operand.
   * @param output_ch      Presumably number of output channels - TODO confirm.
   * @param out_shift      Read-only int32_t array; presumably requantization
   *                       shifts - TODO confirm.
   * @param out_mult       Read-only int32_t array; presumably requantization
   *                       multipliers - TODO confirm.
   * @param out_offset     Presumably output zero-point offset - TODO confirm.
   * @param activation_min Presumably lower clamp bound (int16_t here).
   * @param activation_max Presumably upper clamp bound (int16_t here).
   * @param num_col_a      Presumably number of columns of input_a - TODO confirm.
   * @param output_bias    Read-only int32_t bias data.
   * @param out_0          Writable q7_t output pointer.
   *
   * @return A q7_t pointer; presumably the output position after the values
   *         written by this call - TODO confirm.
   */
  180. q7_t *hpm_nn_mat_mult_kernel_s8_s16_reordered_oddch(const q7_t *input_a,
  181. const q15_t *input_b,
  182. const uint16_t output_ch,
  183. const int32_t *out_shift,
  184. const int32_t *out_mult,
  185. const int32_t out_offset,
  186. const int16_t activation_min,
  187. const int16_t activation_max,
  188. const uint16_t num_col_a,
  189. const int32_t *const output_bias,
  190. q7_t *out_0);
  /**
   * @brief Prototype of the matrix-multiplication kernel
   *        hpm_nn_mat_mult_kernel_s8_s16_reordered_8mul.
   *
   * Only the declaration is visible in this header; semantics below are
   * inferred from names and types and need confirmation.
   * NOTE(review): the "8mul" suffix suggests a variant for sizes that are a
   * multiple of 8 - confirm which dimension against the implementation.
   *
   * @param input_a        Read-only q7_t operand.
   * @param input_b        Read-only q15_t operand.
   * @param output_ch      Presumably number of output channels - TODO confirm.
   * @param out_shift      Read-only int32_t array; presumably requantization
   *                       shifts - TODO confirm.
   * @param out_mult       Read-only int32_t array; presumably requantization
   *                       multipliers - TODO confirm.
   * @param out_offset     Presumably output zero-point offset - TODO confirm.
   * @param activation_min Presumably lower clamp bound (int16_t here).
   * @param activation_max Presumably upper clamp bound (int16_t here).
   * @param num_col_a      Presumably number of columns of input_a - TODO confirm.
   * @param output_bias    Read-only int32_t bias data.
   * @param out_0          Writable q7_t output pointer.
   *
   * @return A q7_t pointer; presumably the output position after the values
   *         written by this call - TODO confirm.
   */
  191. q7_t *hpm_nn_mat_mult_kernel_s8_s16_reordered_8mul(const q7_t *input_a,
  192. const q15_t *input_b,
  193. const uint16_t output_ch,
  194. const int32_t *out_shift,
  195. const int32_t *out_mult,
  196. const int32_t out_offset,
  197. const int16_t activation_min,
  198. const int16_t activation_max,
  199. const uint16_t num_col_a,
  200. const int32_t *const output_bias,
  201. q7_t *out_0);
  /**
   * @brief Prototype of the matrix-multiplication kernel
   *        hpm_nn_mat_mult_kernel3_input3_s8_s16.
   *
   * Only the declaration is visible in this header; semantics below are
   * inferred from names and types and need confirmation.
   * NOTE(review): "kernel3_input3" suggests a specialization for a 3x3 kernel
   * with 3 input channels - confirm against the implementation.
   *
   * Takes the same parameter list as the two *_reordered_* kernels declared
   * in this header, plus a trailing writable kbuf buffer.
   *
   * @param input_a        Read-only q7_t operand.
   * @param input_b        Read-only q15_t operand.
   * @param output_ch      Presumably number of output channels - TODO confirm.
   * @param out_shift      Read-only int32_t array; presumably requantization
   *                       shifts - TODO confirm.
   * @param out_mult       Read-only int32_t array; presumably requantization
   *                       multipliers - TODO confirm.
   * @param out_offset     Presumably output zero-point offset - TODO confirm.
   * @param activation_min Presumably lower clamp bound (int16_t here).
   * @param activation_max Presumably upper clamp bound (int16_t here).
   * @param num_col_a      Presumably number of columns of input_a - TODO confirm.
   * @param output_bias    Read-only int32_t bias data.
   * @param out_0          Writable q7_t output pointer.
   * @param kbuf           Writable q15_t buffer; purpose and required size are
   *                       not visible here - TODO confirm with the implementation.
   *
   * @return A q7_t pointer; presumably the output position after the values
   *         written by this call - TODO confirm.
   */
  202. q7_t *hpm_nn_mat_mult_kernel3_input3_s8_s16(const q7_t *input_a,
  203. const q15_t *input_b,
  204. const uint16_t output_ch,
  205. const int32_t *out_shift,
  206. const int32_t *out_mult,
  207. const int32_t out_offset,
  208. const int16_t activation_min,
  209. const int16_t activation_max,
  210. const uint16_t num_col_a,
  211. const int32_t *const output_bias,
  212. q7_t *out_0,
  213. q15_t *kbuf);
  214. #ifdef __cplusplus
  215. }
  216. #endif
  217. #endif