/* ----------------------------------------------------------------------
 * Project: Tiny Training Engine, MCUNetV3
 * Title:   nnfunctions_fp.h
 *
 * Reference papers:
 *  - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020
 *  - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
 *  - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
 * Contact authors:
 *  - Wei-Chen Wang, wweichen@mit.edu
 *  - Wei-Ming Chen, wmchen@mit.edu
 *  - Ji Lin, jilin@mit.edu
 *  - Ligeng Zhu, ligeng@mit.edu
 *  - Song Han, songhan@mit.edu
 *  - Chuang Gan, ganchuang@csail.mit.edu
 *
 * Target ISA:  ARMv7E-M
 * -------------------------------------------------------------------- */
#include <stdint.h>

#include "nnfunctions.h"
/* START: MAC Functions for Pointwise Conv */
/*
 * Pointwise-conv MAC: one accumulator, ten taps.
 *
 * Adds to *sum the products input_0[k] * filter_k for k = 0..9. Each
 * product is added to *sum separately, in tap order, so the sequence of
 * floating-point roundings is fixed (tap 0 first, tap 9 last).
 *
 * sum       in/out accumulator, read and updated in place
 * input_0   must point to at least 10 readable floats
 * filter_k  scalar weight paired with input_0[k]
 */
static inline void mac_1row_10col_fp_IOHW_forint8w(float* sum, const float* input_0,
        const float filter_0, const float filter_1, const float filter_2, const float filter_3, const float filter_4,
        const float filter_5, const float filter_6, const float filter_7, const float filter_8, const float filter_9) {
    *sum += input_0[0] * filter_0;
    *sum += input_0[1] * filter_1;
    *sum += input_0[2] * filter_2;
    *sum += input_0[3] * filter_3;
    *sum += input_0[4] * filter_4;
    *sum += input_0[5] * filter_5;
    *sum += input_0[6] * filter_6;
    *sum += input_0[7] * filter_7;
    *sum += input_0[8] * filter_8;
    *sum += input_0[9] * filter_9;
}
/*
 * Pointwise-conv MAC: four accumulators, four taps each, shared weights.
 *
 * For r = 0..3, adds to sum[r] the products input_r[k] * filter_k for
 * k = 0..3. Rows are processed in order 0,1,2,3 and, within a row, each
 * product is added to sum[r] separately in tap order.
 *
 * sum         in/out; must point to at least 4 writable floats
 * input_0..3  each must point to at least 4 readable floats
 * filter_0..3 scalar weights shared by all four rows
 */
static inline void mac_4row_4col_fp_IOHW_forint8w(float* sum, const float* input_0, const float* input_1, const float* input_2, const float* input_3,
        const float filter_0, const float filter_1, const float filter_2, const float filter_3) {
    const float* const rows[4] = {input_0, input_1, input_2, input_3};

    for (int r = 0; r < 4; r++) {
        const float* in = rows[r];

        sum[r] += in[0] * filter_0;
        sum[r] += in[1] * filter_1;
        sum[r] += in[2] * filter_2;
        sum[r] += in[3] * filter_3;
    }
}
/*
 * Pointwise-conv MAC: one accumulator, four taps.
 *
 * Adds to *sum the products input_0[k] * filter_k for k = 0..3, one
 * product at a time in tap order.
 *
 * sum       in/out accumulator, read and updated in place
 * input_0   must point to at least 4 readable floats
 * filter_k  scalar weight paired with input_0[k]
 */
static inline void mac_1row_4col_fp_IOHW_forint8w(float* sum, const float* input_0,
        const float filter_0, const float filter_1, const float filter_2, const float filter_3) {
    *sum += input_0[0] * filter_0;
    *sum += input_0[1] * filter_1;
    *sum += input_0[2] * filter_2;
    *sum += input_0[3] * filter_3;
}
/* END: MAC Functions for Pointwise Conv */
/* START: MAC Functions for Group Conv */
/*
 * Group-conv MAC: four rows, 16 taps each, one shared filter.
 *
 * For r = 0..3, computes the 16-element dot product of input_r and filter
 * in a local accumulator that starts at zero, then adds that partial
 * result to *sum_r. Rows are handled in order 0,1,2,3 and taps in order
 * 0..15, so the rounding sequence is fixed.
 *
 * sum_0..3    in/out accumulators, each updated once per call
 * input_0..3  each must point to at least 16 readable floats
 * filter      must point to at least 16 readable floats
 *
 * NOTE(review): 16 taps presumably corresponds to a 4x4 window (the
 * "kernel4" in the name) -- confirm against the caller.
 */
static inline void group_mac_kernel4_4row_fp_uniweight_reuse_output_input(float* sum_0, float* sum_1, float* sum_2, float* sum_3,
        const float* input_0, const float* input_1, const float* input_2, const float* input_3,
        const float* filter) {
    enum { NUM_ROWS = 4, NUM_TAPS = 16 };
    float* const outputs[NUM_ROWS] = {sum_0, sum_1, sum_2, sum_3};
    const float* const inputs[NUM_ROWS] = {input_0, input_1, input_2, input_3};

    /* Both trip counts are compile-time constants, so an optimizing
     * compiler is free to unroll these loops. */
    for (int row = 0; row < NUM_ROWS; row++) {
        float acc = 0;

        for (int tap = 0; tap < NUM_TAPS; tap++) {
            acc += inputs[row][tap] * filter[tap];
        }
        *outputs[row] += acc;
    }
}
/*
 * Group-conv MAC: four rows, 64 taps each, one shared filter.
 *
 * For r = 0..3, computes the 64-element dot product of input_r and filter
 * in a local accumulator that starts at zero, then adds that partial
 * result to *sum_r. Rows are handled in order 0,1,2,3 and taps in order
 * 0..63, so the rounding sequence is fixed.
 *
 * sum_0..3    in/out accumulators, each updated once per call
 * input_0..3  each must point to at least 64 readable floats
 * filter      must point to at least 64 readable floats
 *
 * NOTE(review): 64 taps presumably corresponds to an 8x8 window (the
 * "kernel8" in the name) -- confirm against the caller.
 */
static inline void group_mac_kernel8_4row_fp_uniweight_reuse_output_input(float* sum_0, float* sum_1, float* sum_2, float* sum_3,
        const float* input_0, const float* input_1, const float* input_2, const float* input_3,
        const float* filter) {
    enum { NUM_ROWS = 4, NUM_TAPS = 64 };
    float* const outputs[NUM_ROWS] = {sum_0, sum_1, sum_2, sum_3};
    const float* const inputs[NUM_ROWS] = {input_0, input_1, input_2, input_3};

    /* Both trip counts are compile-time constants, so an optimizing
     * compiler is free to unroll these loops. */
    for (int row = 0; row < NUM_ROWS; row++) {
        float acc = 0;

        for (int tap = 0; tap < NUM_TAPS; tap++) {
            acc += inputs[row][tap] * filter[tap];
        }
        *outputs[row] += acc;
    }
}
/* END: MAC Functions for Group Conv */
/* START: MAC Functions for Transpose Depthwise Conv */
/* START: For 3x3 kernel size */
/*
 * Transpose-depthwise MAC, 3x3 weights, two accumulators at once.
 *
 * Walks three buffer rows. On row ky (0..2) and column kx (0..2):
 *   *sum_0 += row[kx]     * ksrc_transposed[ky * 3 + kx]
 *   *sum_1 += row[kx + 1] * ksrc_transposed[ky * 3 + kx]
 * so sum_1 sees the same weights applied to a window shifted one float
 * to the right. Updates alternate sum_0, sum_1 per tap, row-major.
 *
 * Successive rows are (input_width - 1) * STRIDE + 1 + IN_PAD * 2 + OUT_PAD
 * floats apart (evaluated in int after integer promotion).
 * NOTE(review): that distance is presumably the width of the padded,
 * stride-expanded buffer -- confirm against the caller.
 *
 * two_column_buffer  top-left float of the sum_0 window; columns 0..3 of
 *                    each of the three rows are read
 * ksrc_transposed    at least 9 readable floats
 */
static inline void transpose_depthwise_mac_kernel3_2row_fp_uniweight(float* sum_0, float* sum_1,
        const float* two_column_buffer, const float* ksrc_transposed, const uint16_t input_width,
        const uint16_t STRIDE, const uint16_t IN_PAD, const uint16_t OUT_PAD) {
    enum { KERNEL_DIM = 3 };
    const int row_step = (input_width - 1) * STRIDE + 1 + IN_PAD * 2 + OUT_PAD;

    for (int ky = 0; ky < KERNEL_DIM; ky++) {
        const float* row = two_column_buffer + ky * row_step;
        const float* weights = ksrc_transposed + ky * KERNEL_DIM;

        for (int kx = 0; kx < KERNEL_DIM; kx++) {
            *sum_0 += row[kx] * weights[kx];
            *sum_1 += row[kx + 1] * weights[kx];
        }
    }
}
/*
 * Transpose-depthwise MAC, 3x3 weights, single accumulator.
 *
 * Walks three buffer rows. On row ky (0..2) and column kx (0..2):
 *   *sum_0 += row[kx] * ksrc_transposed[ky * 3 + kx]
 * Products are added one at a time in row-major order.
 *
 * Successive rows are (input_width - 1) * STRIDE + 1 + IN_PAD * 2 + OUT_PAD
 * floats apart (evaluated in int after integer promotion).
 * NOTE(review): that distance is presumably the width of the padded,
 * stride-expanded buffer -- confirm against the caller.
 *
 * two_column_buffer  top-left float of the window; columns 0..2 of each
 *                    of the three rows are read
 * ksrc_transposed    at least 9 readable floats
 */
static inline void transpose_depthwise_mac_kernel3_1row_fp_uniweight(float* sum_0,
        const float* two_column_buffer, const float* ksrc_transposed, const uint16_t input_width,
        const uint16_t STRIDE, const uint16_t IN_PAD, const uint16_t OUT_PAD) {
    enum { KERNEL_DIM = 3 };
    const int row_step = (input_width - 1) * STRIDE + 1 + IN_PAD * 2 + OUT_PAD;

    for (int ky = 0; ky < KERNEL_DIM; ky++) {
        const float* row = two_column_buffer + ky * row_step;
        const float* weights = ksrc_transposed + ky * KERNEL_DIM;

        for (int kx = 0; kx < KERNEL_DIM; kx++) {
            *sum_0 += row[kx] * weights[kx];
        }
    }
}
/* END: For 3x3 kernel size */
/* START: For 5x5 kernel size */
/*
 * Transpose-depthwise MAC, 5x5 weights, two accumulators at once.
 *
 * Walks five buffer rows. On row ky (0..4) and column kx (0..4):
 *   *sum_0 += row[kx]     * ksrc_transposed[ky * 5 + kx]
 *   *sum_1 += row[kx + 1] * ksrc_transposed[ky * 5 + kx]
 * so sum_1 sees the same weights applied to a window shifted one float
 * to the right. Updates alternate sum_0, sum_1 per tap, row-major.
 *
 * Successive rows are (input_width - 1) * STRIDE + 1 + IN_PAD * 2 + OUT_PAD
 * floats apart (evaluated in int after integer promotion).
 * NOTE(review): that distance is presumably the width of the padded,
 * stride-expanded buffer -- confirm against the caller.
 *
 * two_column_buffer  top-left float of the sum_0 window; columns 0..5 of
 *                    each of the five rows are read
 * ksrc_transposed    at least 25 readable floats
 */
static inline void transpose_depthwise_mac_kernel5_2row_fp_uniweight(float* sum_0, float* sum_1,
        const float* two_column_buffer, const float* ksrc_transposed, const uint16_t input_width,
        const uint16_t STRIDE, const uint16_t IN_PAD, const uint16_t OUT_PAD) {
    enum { KERNEL_DIM = 5 };
    const int row_step = (input_width - 1) * STRIDE + 1 + IN_PAD * 2 + OUT_PAD;

    for (int ky = 0; ky < KERNEL_DIM; ky++) {
        const float* row = two_column_buffer + ky * row_step;
        const float* weights = ksrc_transposed + ky * KERNEL_DIM;

        for (int kx = 0; kx < KERNEL_DIM; kx++) {
            *sum_0 += row[kx] * weights[kx];
            *sum_1 += row[kx + 1] * weights[kx];
        }
    }
}
/*
 * Transpose-depthwise MAC, 5x5 weights, single accumulator.
 *
 * Walks five buffer rows. On row ky (0..4) and column kx (0..4):
 *   *sum_0 += row[kx] * ksrc_transposed[ky * 5 + kx]
 * Products are added one at a time in row-major order.
 *
 * Successive rows are (input_width - 1) * STRIDE + 1 + IN_PAD * 2 + OUT_PAD
 * floats apart (evaluated in int after integer promotion).
 * NOTE(review): that distance is presumably the width of the padded,
 * stride-expanded buffer -- confirm against the caller.
 *
 * two_column_buffer  top-left float of the window; columns 0..4 of each
 *                    of the five rows are read
 * ksrc_transposed    at least 25 readable floats
 */
static inline void transpose_depthwise_mac_kernel5_1row_fp_uniweight(float* sum_0,
        const float* two_column_buffer, const float* ksrc_transposed, const uint16_t input_width,
        const uint16_t STRIDE, const uint16_t IN_PAD, const uint16_t OUT_PAD) {
    enum { KERNEL_DIM = 5 };
    const int row_step = (input_width - 1) * STRIDE + 1 + IN_PAD * 2 + OUT_PAD;

    for (int ky = 0; ky < KERNEL_DIM; ky++) {
        const float* row = two_column_buffer + ky * row_step;
        const float* weights = ksrc_transposed + ky * KERNEL_DIM;

        for (int kx = 0; kx < KERNEL_DIM; kx++) {
            *sum_0 += row[kx] * weights[kx];
        }
    }
}
/* END: For 5x5 kernel size */
/* START: For 7x7 kernel size */
/*
 * Transpose-depthwise MAC, 7x7 weights, two accumulators at once.
 *
 * Walks seven buffer rows. On row ky (0..6) and column kx (0..6):
 *   *sum_0 += row[kx]     * ksrc_transposed[ky * 7 + kx]
 *   *sum_1 += row[kx + 1] * ksrc_transposed[ky * 7 + kx]
 * so sum_1 sees the same weights applied to a window shifted one float
 * to the right. Updates alternate sum_0, sum_1 per tap, row-major.
 *
 * Successive rows are (input_width - 1) * STRIDE + 1 + IN_PAD * 2 + OUT_PAD
 * floats apart (evaluated in int after integer promotion).
 * NOTE(review): that distance is presumably the width of the padded,
 * stride-expanded buffer -- confirm against the caller.
 *
 * two_column_buffer  top-left float of the sum_0 window; columns 0..7 of
 *                    each of the seven rows are read
 * ksrc_transposed    at least 49 readable floats
 */
static inline void transpose_depthwise_mac_kernel7_2row_fp_uniweight(float* sum_0, float* sum_1,
        const float* two_column_buffer, const float* ksrc_transposed, const uint16_t input_width,
        const uint16_t STRIDE, const uint16_t IN_PAD, const uint16_t OUT_PAD) {
    enum { KERNEL_DIM = 7 };
    const int row_step = (input_width - 1) * STRIDE + 1 + IN_PAD * 2 + OUT_PAD;

    for (int ky = 0; ky < KERNEL_DIM; ky++) {
        const float* row = two_column_buffer + ky * row_step;
        const float* weights = ksrc_transposed + ky * KERNEL_DIM;

        for (int kx = 0; kx < KERNEL_DIM; kx++) {
            *sum_0 += row[kx] * weights[kx];
            *sum_1 += row[kx + 1] * weights[kx];
        }
    }
}
/*
 * Accumulates a 7x7 multiply-accumulate into *sum_0.
 *
 * Seven rows of seven consecutive floats are read from two_column_buffer.
 * Consecutive rows start
 *     (input_width - 1) * STRIDE + 1 + IN_PAD * 2 + OUT_PAD
 * floats apart. Element (row, col) of that window is multiplied by
 * ksrc_transposed[row * 7 + col] and the product is added to *sum_0, one
 * product at a time in row-major order.
 *
 * sum_0            accumulator, updated in place
 * two_column_buffer first element of the top window row
 * ksrc_transposed  49 kernel values, read in order 0..48
 */
static inline void transpose_depthwise_mac_kernel7_1row_fp_uniweight(float* sum_0,
    const float* two_column_buffer, const float* ksrc_transposed, const uint16_t input_width,
    const uint16_t STRIDE, const uint16_t IN_PAD, const uint16_t OUT_PAD) {
    const int row_step = (input_width - 1) * STRIDE + 1 + IN_PAD * 2 + OUT_PAD;
    const float* window_row = two_column_buffer;
    const float* weight = ksrc_transposed;

    for (int row = 0; row < 7; ++row) {
        /* Advance only between rows, so the pointer never moves past the last row. */
        if (row > 0) {
            window_row += row_step;
        }
        for (int col = 0; col < 7; ++col) {
            *sum_0 += window_row[col] * *weight++;
        }
    }
}
- /* END: For 7x7 kernel size*/
- /* END: MAC Functions for Transpose Depthwise Conv */
- /* START: Assign Output Functions */
- /* START: For Group Conv */
/*
 * Returns value - delta as an int8_t, truncated toward zero and saturated to
 * [-128, 127]. Converting an out-of-range float to int8_t is undefined
 * behaviour in C (and wraps on common targets), so the range is checked
 * before the cast. A NaN result leaves the value unchanged.
 */
static inline int8_t group_output_4row16col_saturating_step(const int8_t value, const float delta) {
    const float updated = (float)value - delta;
    if (updated >= 127.0f) {
        return 127;
    }
    if (updated <= -128.0f) {
        return -128;
    }
    if (updated != updated) { /* NaN compares unequal to itself */
        return value;
    }
    return (int8_t)updated;
}

/*
 * In-place update of 16 int8 outputs, 4 elements each.
 *
 * For every output pointer out_k (k = 0..15) and every j = 0..3:
 *     out_k[j] -= TN_MIN(TN_MAX(sum_j[k], output_activation_min), output_activation_max)
 *                 * scales[i_output_depth + k] * learning_rate
 * where sum_j is one of sum_0..sum_3. Updates are applied in the order
 * j = 0..3 (outer), k = 0..15 (inner). The result is truncated toward zero
 * and saturated to the int8_t range instead of overflowing.
 *
 * out_0..out_15  destinations; elements [0..3] of each are modified. The
 *                pointers are passed by value, so the caller's copies do
 *                not advance.
 * sum_0..sum_3   each supplies 16 floats, indexed by k.
 * scales         read at indices i_output_depth .. i_output_depth + 15.
 *
 * NOTE(review): presumably a quantized weight update (gradient * scale *
 * learning rate) - confirm against the caller.
 */
static inline void assign_sum_to_group_output_4row16col(int8_t* out_0, int8_t* out_1, int8_t* out_2, int8_t* out_3, int8_t* out_4, int8_t* out_5, int8_t* out_6, int8_t* out_7,
    int8_t* out_8, int8_t* out_9, int8_t* out_10, int8_t* out_11, int8_t* out_12, int8_t* out_13, int8_t* out_14, int8_t* out_15,
    const float* sum_0, const float* sum_1, const float* sum_2, const float* sum_3,
    const float output_activation_min, const float output_activation_max, const float* scales, const float learning_rate, const int i_output_depth) {
    int8_t* const outputs[16] = {out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7,
                                 out_8, out_9, out_10, out_11, out_12, out_13, out_14, out_15};
    const float* const sums[4] = {sum_0, sum_1, sum_2, sum_3};

    for (int j = 0; j < 4; ++j) {
        for (int k = 0; k < 16; ++k) {
            outputs[k][j] = group_output_4row16col_saturating_step(
                outputs[k][j],
                TN_MIN(TN_MAX(sums[j][k], output_activation_min), output_activation_max) * scales[i_output_depth + k] * learning_rate);
        }
    }
}
/*
 * Returns value - delta as an int8_t, truncated toward zero and saturated to
 * [-128, 127]. Converting an out-of-range float to int8_t is undefined
 * behaviour in C (and wraps on common targets), so the range is checked
 * before the cast. A NaN result leaves the value unchanged.
 */
static inline int8_t group_output_4row8col_saturating_step(const int8_t value, const float delta) {
    const float updated = (float)value - delta;
    if (updated >= 127.0f) {
        return 127;
    }
    if (updated <= -128.0f) {
        return -128;
    }
    if (updated != updated) { /* NaN compares unequal to itself */
        return value;
    }
    return (int8_t)updated;
}

/*
 * In-place update of 8 int8 outputs, 4 elements each.
 *
 * For every output pointer out_k (k = 0..7) and every j = 0..3:
 *     out_k[j] -= TN_MIN(TN_MAX(sum_j[k], output_activation_min), output_activation_max)
 *                 * scales[i_output_depth + k] * learning_rate
 * where sum_j is one of sum_0..sum_3. Updates are applied in the order
 * j = 0..3 (outer), k = 0..7 (inner). The result is truncated toward zero
 * and saturated to the int8_t range instead of overflowing.
 *
 * out_0..out_7   destinations; elements [0..3] of each are modified. The
 *                pointers are passed by value, so the caller's copies do
 *                not advance.
 * sum_0..sum_3   each supplies 8 floats, indexed by k.
 * scales         read at indices i_output_depth .. i_output_depth + 7.
 *
 * NOTE(review): presumably a quantized weight update (gradient * scale *
 * learning rate) - confirm against the caller.
 */
static inline void assign_sum_to_group_output_4row8col(int8_t* out_0, int8_t* out_1, int8_t* out_2, int8_t* out_3, int8_t* out_4, int8_t* out_5, int8_t* out_6, int8_t* out_7,
    const float* sum_0, const float* sum_1, const float* sum_2, const float* sum_3,
    const float output_activation_min, const float output_activation_max, const float* scales, const float learning_rate, const int i_output_depth) {
    int8_t* const outputs[8] = {out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7};
    const float* const sums[4] = {sum_0, sum_1, sum_2, sum_3};

    for (int j = 0; j < 4; ++j) {
        for (int k = 0; k < 8; ++k) {
            outputs[k][j] = group_output_4row8col_saturating_step(
                outputs[k][j],
                TN_MIN(TN_MAX(sums[j][k], output_activation_min), output_activation_max) * scales[i_output_depth + k] * learning_rate);
        }
    }
}
- /* END: For Group Conv */
- /* START: For Pointwise Conv */
/*
 * Adds 32 range-limited sums into four float outputs, 8 elements each.
 *
 * sum is read as 8 groups of 4 values: for col = 0..7 and r = 0..3,
 *     out_r[col] += TN_MIN(TN_MAX(sum[4 * col + r], output_activation_min),
 *                          output_activation_max)
 * The output pointers are passed by value, so the caller's copies do not
 * advance.
 */
static inline void assign_sum_to_pointwise_output_4row8col(float* out_0, float* out_1, float* out_2, float* out_3,
    const float* sum, const float output_activation_min, const float output_activation_max) {
    float* const outputs[4] = {out_0, out_1, out_2, out_3};

    for (int col = 0; col < 8; ++col) {
        for (int r = 0; r < 4; ++r) {
            outputs[r][col] += TN_MIN(TN_MAX(sum[4 * col + r], output_activation_min), output_activation_max);
        }
    }
}
/*
 * Adds 8 range-limited sums into one float output:
 *     out_0[i] += TN_MIN(TN_MAX(sum[i], output_activation_min), output_activation_max)
 * for i = 0..7. out_0 is passed by value, so the caller's pointer does not
 * advance.
 */
static inline void assign_sum_to_pointwise_output_1row8col(float* out_0,
    const float* sum, const float output_activation_min, const float output_activation_max) {
    for (int i = 0; i < 8; ++i) {
        out_0[i] += TN_MIN(TN_MAX(sum[i], output_activation_min), output_activation_max);
    }
}
/*
 * Adds 32 sums into four float outputs, 8 elements each, with no range
 * limiting.
 *
 * sum is read as 8 groups of 4 values: for col = 0..7 and r = 0..3,
 *     out_r[col] += sum[4 * col + r]
 * The output pointers are passed by value, so the caller's copies do not
 * advance.
 */
static inline void assign_sum_to_pointwise_output_4row8col_noMINMAX(float* out_0, float* out_1, float* out_2, float* out_3, const float* sum) {
    float* const outputs[4] = {out_0, out_1, out_2, out_3};
    const float* next = sum;

    for (int col = 0; col < 8; ++col) {
        for (int r = 0; r < 4; ++r) {
            outputs[r][col] += *next++;
        }
    }
}
/*
 * Adds 8 sums into one float output with no range limiting:
 *     out_0[i] += sum[i]   for i = 0..7
 * out_0 is passed by value, so the caller's pointer does not advance.
 */
static inline void assign_sum_to_pointwise_output_1row8col_noMINMAX(float* out_0, const float* sum) {
    for (int i = 0; i < 8; ++i) {
        out_0[i] += sum[i];
    }
}
/*
 * Adds 16 range-limited sums into four float outputs, 4 elements each.
 *
 * sum is read as 4 groups of 4 values: for col = 0..3 and r = 0..3,
 *     out_r[col] += TN_MIN(TN_MAX(sum[4 * col + r], output_activation_min),
 *                          output_activation_max)
 * The output pointers are passed by value, so the caller's copies do not
 * advance.
 */
static inline void assign_sum_to_pointwise_output_4row4col(float* out_0, float* out_1, float* out_2, float* out_3,
    const float* sum, const float output_activation_min, const float output_activation_max) {
    float* const outputs[4] = {out_0, out_1, out_2, out_3};

    for (int col = 0; col < 4; ++col) {
        for (int r = 0; r < 4; ++r) {
            outputs[r][col] += TN_MIN(TN_MAX(sum[4 * col + r], output_activation_min), output_activation_max);
        }
    }
}
/*
 * Adds 4 range-limited sums into one float output:
 *     out_0[i] += TN_MIN(TN_MAX(sum[i], output_activation_min), output_activation_max)
 * for i = 0..3. out_0 is passed by value, so the caller's pointer does not
 * advance.
 */
static inline void assign_sum_to_pointwise_output_1row4col(float* out_0,
    const float* sum, const float output_activation_min, const float output_activation_max) {
    for (int i = 0; i < 4; ++i) {
        out_0[i] += TN_MIN(TN_MAX(sum[i], output_activation_min), output_activation_max);
    }
}
- /* END: For Pointwise Conv */
- /* END: Assign Output Functions */
|