- /*
- * Copyright (C) 2010-2022 Arm Limited or its affiliates.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- /* ----------------------------------------------------------------------
- * This file is MODIFIED from Arm CMSIS NN Library.
- *
- * Project: TinyEngine
- * Title: arm_nnfunctions_modified.h
- * Description: Public header file for TinyEngine.
- *
- * Reference papers:
- * - MCUNet: Tiny Deep Learning on IoT Devices, NeurIPS 2020
- * - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
- * - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
- * Contact authors:
- * - Wei-Ming Chen, wmchen@mit.edu
- * - Wei-Chen Wang, wweichen@mit.edu
- * - Ji Lin, jilin@mit.edu
- * - Ligeng Zhu, ligeng@mit.edu
- * - Song Han, songhan@mit.edu
- *
- * Original Project: CMSIS NN Library
- * Original Title: arm_nnfunctions.h
- *
- * Target Processor: Cortex-M CPUs
- * -------------------------------------------------------------------- */
- /**
- \mainpage CMSIS NN Software Library
- *
- * Introduction
- * ------------
- *
- * This user manual describes the CMSIS NN software library,
- * a collection of efficient neural network kernels developed to maximize the
- * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
- *
- * The library is divided into a number of functions each covering a specific category:
- * - Convolution Functions
- * - Activation Functions
- * - Fully-connected Layer Functions
- * - SVDF Layer Functions
- * - Pooling Functions
- * - Softmax Functions
- * - Basic math Functions
- *
- * The library has separate functions for operating on different weight and activation data
- * types, including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of each
- * kernel is included in its function documentation. The implementation details are also
- * described in [1].
- *
- * Function Classification
- * --------
- * The functions can be classified into two segments:
- * - Legacy functions supporting ARM's internal symmetric quantization (8 bits).
- * - Functions that support the TensorFlow Lite framework with symmetric quantization (8 bits).
- *
- * The legacy functions can be identified by their _q7 or _q15 suffix; no new development is done on them.
- * The article in [2] describes in detail how to run a network using the legacy functions.
- *
- * The functions supporting the TensorFlow Lite framework are identified by the _s8 suffix and can be invoked from
- * TFLite Micro. These functions are bit-exact with TensorFlow Lite. Refer to TensorFlow's documentation in [3] on
- * how to run a TensorFlow Lite model using optimized CMSIS-NN kernels.
- *
- * Block Diagram
- * --------
- * \image html CMSIS-NN-OVERVIEW.PNG
- *
- * Examples
- * --------
- *
- * The library ships with a number of examples which demonstrate how to use the library functions.
- *
- * Pre-processor Macros
- * ------------
- *
- * Each library project has different pre-processor macros.
- *
- * - ARM_MATH_DSP:
- *
- * Define the macro ARM_MATH_DSP if the silicon supports DSP instructions (DSP extension).
- *
- * - ARM_MATH_MVEI:
- *
- * Define the macro ARM_MATH_MVEI if the silicon supports the M-Profile Vector Extension.
- *
- * - ARM_MATH_AUTOVECTORIZE:
- *
- * Used in conjunction with ARM_MATH_MVEI to let the compiler auto-vectorize the functions that use inline
- * assembly. It does not affect functions that use C or intrinsics.
- * - ARM_MATH_BIG_ENDIAN:
- *
- * Define the macro ARM_MATH_BIG_ENDIAN to build the library for big-endian targets. This is supported only for the
- * legacy functions, i.e., functions targeted at TensorFlow Lite do not support big endianness. By default, the
- * library builds for little-endian targets.
- *
- * - ARM_NN_TRUNCATE:
- *
- * Define the macro ARM_NN_TRUNCATE to use floor instead of round-to-nearest for the computation.
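- *
- * For example, scaling an accumulator value of 7 by 1/2 (an arithmetic right shift by one) yields 3 with
- * ARM_NN_TRUNCATE (floor) but 4 with the default round-to-nearest behaviour, which adds half the divisor
- * before shifting.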
- *
- *
- * Copyright Notice
- * ------------
- *
- * Copyright (C) 2010-2019 Arm Limited. All rights reserved.
- *
- * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
- *
- * [2] Converting a Neural Network for Arm Cortex-M with CMSIS-NN
- *
- * https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page
- * [3] https://www.tensorflow.org/lite/microcontrollers/library
- *
- * [4] https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN#legacy-vs-tfl-micro-compliant-apis
- */
- /**
- * @defgroup groupNN Neural Network Functions
- * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix
- * support the TensorFlow Lite framework.
- */
- #ifndef _ARM_NNFUNCTIONS_H
- #define _ARM_NNFUNCTIONS_H
- #include "hpm_math.h"
- #define USE_INTRINSIC
- //#define ARM_NN_TRUNCATE /* This configures the rounding mode: floor instead of round-to-nearest */
- #ifdef __cplusplus
- extern "C" {
- #endif
- /**
- * @defgroup NNConv Convolution Functions
- *
- * Collection of convolution, depthwise convolution functions and their variants.
- *
- * The convolution is implemented in two steps: im2col and GEMM.
- *
- * im2col is a process of converting each patch of image data into
- * a column. After im2col, the convolution is computed as matrix-matrix
- * multiplication.
- *
- * To reduce the memory footprint, the im2col is performed partially.
- * In each iteration, only a few columns (i.e., patches) are generated and
- * computed with GEMM kernels similar to the CMSIS-DSP arm_mat_mult functions,
- * as sketched after this comment block.
- *
- */
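- /**
- * A minimal sketch of the im2col step described above, assuming the HWC
- * (channel-last) input layout used by CMSIS-NN. The helper name is
- * hypothetical, and the real kernels additionally reorder and unroll the
- * expanded data for speed.
- *
- * \code
- * static void im2col_one_column_s8(const q7_t *input,
- *                                  const uint16_t input_x, const uint16_t input_y,
- *                                  const uint16_t input_ch,
- *                                  const uint16_t kernel_x, const uint16_t kernel_y,
- *                                  const uint16_t pad_x, const uint16_t pad_y,
- *                                  const uint16_t stride_x, const uint16_t stride_y,
- *                                  const int32_t input_offset,
- *                                  const uint16_t out_x, const uint16_t out_y,
- *                                  q15_t *col) // kernel_y * kernel_x * input_ch entries
- * {
- *     for (int ky = 0; ky < kernel_y; ky++) {
- *         for (int kx = 0; kx < kernel_x; kx++) {
- *             const int in_y = out_y * stride_y - pad_y + ky;
- *             const int in_x = out_x * stride_x - pad_x + kx;
- *             for (int c = 0; c < input_ch; c++) {
- *                 if (in_y < 0 || in_y >= input_y || in_x < 0 || in_x >= input_x)
- *                     *col++ = 0; // padded positions contribute zero after the offset
- *                 else
- *                     *col++ = (q15_t)(input[(in_y * input_x + in_x) * input_ch + c] + input_offset);
- *             }
- *         }
- *     }
- * }
- * \endcode
- *
- * The GEMM step then multiplies rows of the weight matrix against such columns;
- * that is what the mat-mult kernels declared below implement.
- */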
- char arm_convolve_s8_4col(const q7_t *input,
- const uint16_t input_x,
- const uint16_t input_y,
- const uint16_t input_ch,
- const uint16_t input_batches,
- const q7_t *kernel,
- const uint16_t output_ch,
- const uint16_t kernel_x,
- const uint16_t kernel_y,
- const uint16_t pad_x,
- const uint16_t pad_y,
- const uint16_t stride_x,
- const uint16_t stride_y,
- const int32_t *bias,
- q7_t *output,
- const int32_t *output_shift,
- const int32_t *output_mult,
- const int32_t out_offset,
- const int32_t input_offset,
- const int32_t out_activation_min,
- const int32_t out_activation_max,
- const uint16_t output_x,
- const uint16_t output_y,
- q15_t *buffer_a);
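- /**
- * Example invocation of arm_convolve_s8_4col for a 3x3, stride-1, pad-1
- * convolution. The shapes, quantization parameters, and the im2col buffer
- * sizing (2 * input_ch * kernel_x * kernel_y q15 entries, following the
- * CMSIS-NN convention) are illustrative assumptions, not requirements
- * documented here.
- *
- * \code
- * #define IN_DIM 16
- * #define IN_CH 8
- * #define OUT_CH 16
- *
- * static const q7_t input[IN_DIM * IN_DIM * IN_CH];
- * static const q7_t kernel[OUT_CH * 3 * 3 * IN_CH];
- * static const int32_t bias[OUT_CH];
- * static const int32_t out_mult[OUT_CH];
- * static const int32_t out_shift[OUT_CH];
- * static q7_t output[IN_DIM * IN_DIM * OUT_CH];
- * static q15_t buffer_a[2 * IN_CH * 3 * 3];
- *
- * arm_convolve_s8_4col(input, IN_DIM, IN_DIM, IN_CH, 1,
- *                      kernel, OUT_CH, 3, 3,
- *                      1, 1,            // pad_x, pad_y
- *                      1, 1,            // stride_x, stride_y
- *                      bias, output, out_shift, out_mult,
- *                      0, 128,          // out_offset, input_offset
- *                      -128, 127,       // activation range
- *                      IN_DIM, IN_DIM,  // output_x, output_y (stride 1, pad 1)
- *                      buffer_a);
- * \endcode
- */
- /**
- * @brief s8/s16 matrix-multiplication kernel with reordered weights; the
- *        _oddch suffix indicates it handles odd output-channel counts.
- */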
- q7_t *hpm_nn_mat_mult_kernel_s8_s16_reordered_oddch(const q7_t *input_a,
- const q15_t *input_b,
- const uint16_t output_ch,
- const int32_t *out_shift,
- const int32_t *out_mult,
- const int32_t out_offset,
- const int16_t activation_min,
- const int16_t activation_max,
- const uint16_t num_col_a,
- const int32_t *const output_bias,
- q7_t *out_0);
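- /**
- * @brief Variant of the reordered s8/s16 matrix-multiplication kernel; the
- *        _8mul suffix suggests output-channel counts that are multiples of 8.
- */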
- q7_t *hpm_nn_mat_mult_kernel_s8_s16_reordered_8mul(const q7_t *input_a,
- const q15_t *input_b,
- const uint16_t output_ch,
- const int32_t *out_shift,
- const int32_t *out_mult,
- const int32_t out_offset,
- const int16_t activation_min,
- const int16_t activation_max,
- const uint16_t num_col_a,
- const int32_t *const output_bias,
- q7_t *out_0);
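- /**
- * @brief s8/s16 matrix-multiplication kernel, per its name specialized for
- *        3x3 kernels and 3 input channels; takes a pre-expanded q15 kernel
- *        buffer kbuf.
- */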
- q7_t *hpm_nn_mat_mult_kernel3_input3_s8_s16(const q7_t *input_a,
- const q15_t *input_b,
- const uint16_t output_ch,
- const int32_t *out_shift,
- const int32_t *out_mult,
- const int32_t out_offset,
- const int16_t activation_min,
- const int16_t activation_max,
- const uint16_t num_col_a,
- const int32_t *const output_bias,
- q7_t *out_0,
- q15_t *kbuf);
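- /**
- * All of the kernels above share the same output requantization: each int32
- * accumulator is scaled back to int8 by a per-channel multiplier and shift,
- * offset by the output zero point, and clamped to the activation range. A
- * simplified, hypothetical sketch of that arithmetic (the library's
- * fixed-point routine additionally uses a saturating doubling-high multiply):
- *
- * \code
- * static q7_t requantize_s8(int32_t acc, int32_t mult, int32_t shift,
- *                           int32_t out_offset, int32_t act_min, int32_t act_max)
- * {
- *     // Scale by the Q31 fixed-point multiplier, then shift back down.
- *     // Assumes 0 < (31 - shift) < 63, which holds for typical TFLite shifts.
- *     int64_t prod = (int64_t)acc * mult;
- *     const int32_t total_shift = 31 - shift;
- * #ifndef ARM_NN_TRUNCATE
- *     prod += (int64_t)1 << (total_shift - 1); // round to nearest
- * #endif
- *     int32_t res = (int32_t)(prod >> total_shift);
- *     res += out_offset;                  // add the output zero point
- *     if (res < act_min) res = act_min;   // clamp to the activation range
- *     if (res > act_max) res = act_max;
- *     return (q7_t)res;
- * }
- * \endcode
- */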
- #ifdef __cplusplus
- }
- #endif
- #endif