/* ----------------------------------------------------------------------
 * Project: TinyEngine
 * Title: img2col_element.h
 *
 * Reference papers:
 * - MCUNet: Tiny Deep Learning on IoT Device, NeurIPS 2020
 * - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
 * - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
 * Contact authors:
 * - Wei-Ming Chen, wmchen@mit.edu
 * - Wei-Chen Wang, wweichen@mit.edu
 * - Ji Lin, jilin@mit.edu
 * - Ligeng Zhu, ligeng@mit.edu
 * - Song Han, songhan@mit.edu
 *
 * Target ISA: ARMv7E-M
 * -------------------------------------------------------------------- */
/*
 * Copyright (c) 2023 HPMicro
 *
 * SPDX-License-Identifier: BSD-3-Clause
 * Target ISA: RISCV D45
 *
 */
#ifndef ARMNN_INCLUDE_IMG2COL_ELEMENT_H_
#define ARMNN_INCLUDE_IMG2COL_ELEMENT_H_
#include "hpm_math.h"
- #define b2_q7_q15_offset_ele(src, dst) \
- /* convert from q7 to q15 and then store the results in the destination buffer */ \
- /*in_q7x4 = b2_nn_read_q7x4_ia((const q7_t **)&src); \
- in_q15x2_1 = __SXTB16_ROR(in_q7x4, 8); \
- in_q15x2_2 = __SXTB16(in_q7x4); */ \
- in_q15x2_1 = ((src[0] & 0x0C) >> 2) + ((src[0] & 0xC0) << 10); \
- in_q15x2_2 = (src[0] & 0x03) + ((src[0] & 0x30) << 12); \
- src += 1; \
- out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \
- /* Maximum of 9 bits from the addition is expected */ \
- out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
- \
- out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \
- out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
- \
- write_q15x2_ia(&dst, out_q15x2_1); \
- write_q15x2_ia(&dst, out_q15x2_2);
- #define b4_q7_q15_offset_ele(src, dst) \
- /* convert from q7 to q15 and then store the results in the destination buffer */ \
- /*in_q7x4 = b4_nn_read_q7x4_ia((const q7_t **)&src); \
- in_q15x2_1 = __SXTB16_ROR(in_q7x4, 8); \
- in_q15x2_2 = __SXTB16(in_q7x4); */ \
- in_q15x2_1 = ((src[0] & 0xF0) >> 4) + ((src[1] & 0xF0) << 12); \
- in_q15x2_2 = (src[0] & 0x0F) + ((src[1] & 0x0F) << 16); \
- src += 2; \
- out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \
- /* Maximum of 9 bits from the addition is expected */ \
- out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
- \
- out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \
- out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
- \
- write_q15x2_ia(&dst, out_q15x2_1); \
- write_q15x2_ia(&dst, out_q15x2_2);
- #define q7_q15_offset_ele(src, dst) \
- /* convert from q7 to q15 and then store the results in the destination buffer */ \
- in_q7x4 = hpm_nn_read_q7x4_ia((const q7_t **)&src); \
- /* Extract and sign extend each of the four q7 values to q15 */ \
- in_q15x2_1 = __SXTB16_ROR(in_q7x4, 8); \
- in_q15x2_2 = __SXTB16(in_q7x4); \
- \
- out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \
- /* Maximum of 9 bits from the addition is expected */ \
- out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
- \
- out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \
- out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
- \
- write_q15x2_ia(&dst, out_q15x2_1); \
- write_q15x2_ia(&dst, out_q15x2_2);
- #define q8_q15_offset_ele(src, dst) \
- /* convert from q8 to q15 and then store the results in the destination buffer */ \
- in_q7x4 = hpm_nn_read_q7x4_ia((const q8_t **)&src); \
- /* Extend each of the four q8 values to q15 */ \
- in_q15x2_1 = __UXTB16(__ROR(in_q7x4, 8)); \
- in_q15x2_2 = __UXTB16(in_q7x4); \
- \
- out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \
- /* Maximum of 9 bits from the addition is expected */ \
- out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
- \
- out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \
- out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
- \
- write_q15x2_ia(&dst, out_q15x2_1); \
- write_q15x2_ia(&dst, out_q15x2_2);
- #define b4_q15_offset_reordered_ele(src, dst) \
- /* convert from q7 to q15 and then store the results in the destination buffer */ \
- in_q7x4 = b4_nn_read_q7x4_ia((const q7_t **)&src); \
- \
- /* Extract and sign extend each of the four q7 values to q15 */ \
- out_q15x2_1 = __SXTB16_ROR(in_q7x4, 8); \
- out_q15x2_2 = __SXTB16(in_q7x4); \
- \
- out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
- out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
- \
- write_q15x2_ia(&dst, out_q15x2_2); \
- write_q15x2_ia(&dst, out_q15x2_1);
- #define b2_q15_offset_reordered_ele(src, dst) \
- /* convert from q7 to q15 and then store the results in the destination buffer */ \
- in_q7x4 = b2_nn_read_q7x4_ia(&src); \
- \
- /* Extract and sign extend each of the four q7 values to q15 */ \
- out_q15x2_1 = __SXTB16_ROR(in_q7x4, 8); \
- out_q15x2_2 = __SXTB16(in_q7x4); \
- \
- out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
- out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
- \
- write_q15x2_ia(&dst, out_q15x2_2); \
- write_q15x2_ia(&dst, out_q15x2_1);
- #define q7_q15_offset_reordered_ele(src, dst) \
- /* convert from q7 to q15 and then store the results in the destination buffer */ \
- in_q7x4 = hpm_nn_read_q7x4_ia((const q7_t **)&src); \
- \
- /* Extract and sign extend each of the four q7 values to q15 */ \
- out_q15x2_1 = __SXTB16_ROR(in_q7x4, 8); \
- out_q15x2_2 = __SXTB16(in_q7x4); \
- \
- out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
- out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
- \
- write_q15x2_ia(&dst, out_q15x2_2); \
- write_q15x2_ia(&dst, out_q15x2_1);
- #define q31_assign2(src, dst) \
- *dst++ = *src++; \
- *dst++ = *src++;
- #define q31_assign4(src, dst) q31_assign2(src, dst) q31_assign2(src, dst)
- #define q31_assign6(src, dst) q31_assign4(src, dst) q31_assign2(src, dst)
- #define q31_assign8(src, dst) q31_assign4(src, dst) q31_assign4(src, dst)
- #define q31_assign10(src, dst) q31_assign8(src, dst) q31_assign2(src, dst)
- #define q31_assign12(src, dst) q31_assign10(src, dst) q31_assign2(src, dst)
- #define q31_pad2(dst, padvalue) \
- *dst++ = padvalue; \
- *dst++ = padvalue;
- #define q31_pad4(dst, padvalue) q31_pad2(dst, padvalue) q31_pad2(dst, padvalue)
- #define q31_pad6(dst, padvalue) q31_pad4(dst, padvalue) q31_pad2(dst, padvalue)
- #define q31_pad10(dst, padvalue) q31_pad6(dst, padvalue) q31_pad4(dst, padvalue)
- #define q31_pad14(dst, padvalue) q31_pad6(dst, padvalue) q31_pad6(dst, padvalue) q31_pad2(dst, padvalue)
- #define assignq31toq15() \
- dst = (q15_t *)dst_31; \
- dst2 = (q15_t *)dst2_31; \
- dst3 = (q15_t *)dst3_31; \
- dst4 = (q15_t *)dst4_31; \
- dst5 = (q15_t *)dst5_31; \
- dst6 = (q15_t *)dst6_31; \
- dst7 = (q15_t *)dst7_31;
- #define assignq15toq31() \
- dst_31 = (q31_t *)dst; \
- dst2_31 = (q31_t *)dst2; \
- dst3_31 = (q31_t *)dst3; \
- dst4_31 = (q31_t *)dst4; \
- dst5_31 = (q31_t *)dst5; \
- dst6_31 = (q31_t *)dst6; \
- dst7_31 = (q31_t *)dst7;
/* ---------------------------------- Pad ---------------------------------- */
- #define basic_pad_1row(col, dst_31, pad_out_q15x2) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q31_pad2(dst_31, pad_out_q15x2) block_cnt--; \
- }
- #define basic_pad_2row(col, dst_31, dst2_31, pad_out_q15x2) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q31_pad2(dst_31, pad_out_q15x2) q31_pad2(dst2_31, pad_out_q15x2) block_cnt--; \
- }
- #define basic_pad_3row(col, dst_31, dst2_31, dst3_31, pad_out_q15x2) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q31_pad2(dst_31, pad_out_q15x2) q31_pad2(dst2_31, pad_out_q15x2) q31_pad2(dst3_31, pad_out_q15x2) block_cnt--; \
- }
- #define basic_pad_4row(col, dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q31_pad2(dst_31, pad_out_q15x2) q31_pad2(dst2_31, pad_out_q15x2) q31_pad2(dst3_31, pad_out_q15x2) \
- q31_pad2(dst4_31, pad_out_q15x2) block_cnt--; \
- }
- #define basic_pad_5row(col, dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q31_pad2(dst_31, pad_out_q15x2) q31_pad2(dst2_31, pad_out_q15x2) q31_pad2(dst3_31, pad_out_q15x2) \
- q31_pad2(dst4_31, pad_out_q15x2) q31_pad2(dst5_31, pad_out_q15x2) block_cnt--; \
- }
- #define pad_1row_1col(dst_31, pad_out_q15x2) basic_pad_1row(1, dst_31, pad_out_q15x2)
- #define pad_1row_2col(dst_31, pad_out_q15x2) basic_pad_1row(2, dst_31, pad_out_q15x2)
- #define pad_1row_3col(dst_31, pad_out_q15x2) basic_pad_1row(3, dst_31, pad_out_q15x2)
- #define pad_2row_1col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(1, dst_31, dst2_31, pad_out_q15x2)
- #define pad_2row_2col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(2, dst_31, dst2_31, pad_out_q15x2)
- #define pad_2row_3col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(3, dst_31, dst2_31, pad_out_q15x2)
- #define pad_2row_4col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(4, dst_31, dst2_31, pad_out_q15x2)
- #define pad_2row_5col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(5, dst_31, dst2_31, pad_out_q15x2)
- #define pad_3row_1col(dst_31, dst2_31, dst3_31, pad_out_q15x2) \
- basic_pad_3row(1, dst_31, dst2_31, dst3_31, pad_out_q15x2)
- #define pad_3row_2col(dst_31, dst2_31, dst3_31, pad_out_q15x2) \
- basic_pad_3row(2, dst_31, dst2_31, dst3_31, pad_out_q15x2)
- #define pad_3row_3col(dst_31, dst2_31, dst3_31, pad_out_q15x2) \
- basic_pad_3row(3, dst_31, dst2_31, dst3_31, pad_out_q15x2)
- #define pad_4row_1col(dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) \
- basic_pad_4row(1, dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2)
- #define pad_4row_2col(dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) \
- basic_pad_4row(2, dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2)
- #define pad_4row_3col(dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) \
- basic_pad_4row(3, dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2)
- #define pad_5row_1col(dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) \
- basic_pad_5row(1, dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2)
- #define pad_5row_2col(dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) \
- basic_pad_5row(2, dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2)
- #define pad_5row_3col(dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) \
- basic_pad_5row(3, dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2)
/* ---------------------------------- Load ---------------------------------- */
- #define basic_load_1row(col, src, dst) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q7_q15_offset_ele(src, dst) block_cnt--; \
- }
- #define basic_load_2row(col, src, src2, dst, dst2) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q7_q15_offset_ele(src, dst) q7_q15_offset_ele(src2, dst2) block_cnt--; \
- }
- #define basic_load_3row(col, src, src2, src3, dst, dst2, dst3) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q7_q15_offset_ele(src, dst) q7_q15_offset_ele(src2, dst2) q7_q15_offset_ele(src3, dst3) block_cnt--; \
- }
- #define basic_load_4row(col, src, src2, src3, src4, dst, dst2, dst3, dst4) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q7_q15_offset_ele(src, dst) q7_q15_offset_ele(src2, dst2) q7_q15_offset_ele(src3, dst3) \
- q7_q15_offset_ele(src4, dst4) block_cnt--; \
- }
- #define basic_load_5row(col, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q7_q15_offset_ele(src, dst) q7_q15_offset_ele(src2, dst2) q7_q15_offset_ele(src3, dst3) \
- q7_q15_offset_ele(src4, dst4) q7_q15_offset_ele(src5, dst5) block_cnt--; \
- }
///////////////////////// 4bit //////////////////////////
- #define b4_load_1row(col, src, dst) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- b4_q7_q15_offset_ele(src, dst) block_cnt--; \
- }
- #define b4_load_2row(col, src, src2, dst, dst2) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- b4_q7_q15_offset_ele(src, dst) b4_q7_q15_offset_ele(src2, dst2) block_cnt--; \
- }
- #define b4_load_3row(col, src, src2, src3, dst, dst2, dst3) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- b4_q7_q15_offset_ele(src, dst) b4_q7_q15_offset_ele(src2, dst2) b4_q7_q15_offset_ele(src3, dst3) block_cnt--; \
- }
- #define b4_load_4row(col, src, src2, src3, src4, dst, dst2, dst3, dst4) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- b4_q7_q15_offset_ele(src, dst) b4_q7_q15_offset_ele(src2, dst2) b4_q7_q15_offset_ele(src3, dst3) \
- b4_q7_q15_offset_ele(src4, dst4) block_cnt--; \
- }
- #define b4_load_5row(col, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- b4_q7_q15_offset_ele(src, dst) b4_q7_q15_offset_ele(src2, dst2) b4_q7_q15_offset_ele(src3, dst3) \
- b4_q7_q15_offset_ele(src4, dst4) b4_q7_q15_offset_ele(src5, dst5) block_cnt--; \
- }
///////////////////////// 2bit //////////////////////////
- #define b2_load_1row(col, src, dst) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- b2_q7_q15_offset_ele(src, dst) block_cnt--; \
- }
- #define b2_load_2row(col, src, src2, dst, dst2) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- b2_q7_q15_offset_ele(src, dst) b2_q7_q15_offset_ele(src2, dst2) block_cnt--; \
- }
- #define b2_load_3row(col, src, src2, src3, dst, dst2, dst3) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- b2_q7_q15_offset_ele(src, dst) b2_q7_q15_offset_ele(src2, dst2) b2_q7_q15_offset_ele(src3, dst3) block_cnt--; \
- }
- #define b2_load_4row(col, src, src2, src3, src4, dst, dst2, dst3, dst4) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- b2_q7_q15_offset_ele(src, dst) b2_q7_q15_offset_ele(src2, dst2) b2_q7_q15_offset_ele(src3, dst3) \
- b2_q7_q15_offset_ele(src4, dst4) block_cnt--; \
- }
- #define b2_load_5row(col, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- b2_q7_q15_offset_ele(src, dst) b2_q7_q15_offset_ele(src2, dst2) b2_q7_q15_offset_ele(src3, dst3) \
- b2_q7_q15_offset_ele(src4, dst4) b2_q7_q15_offset_ele(src5, dst5) block_cnt--; \
- }
- #define b4_load_1row_1col(src, dst) b4_load_1row(1, src, dst)
- #define b4_load_1row_2col(src, dst) b4_load_1row(2, src, dst)
- #define b4_load_1row_3col(src, dst) b4_load_1row(3, src, dst)
- #define b4_load_1row_4col(src, dst) b4_load_1row(4, src, dst)
- #define b4_load_2row_1col(src, src2, dst, dst2) b4_load_2row(1, src, src2, dst, dst2)
- #define b4_load_2row_2col(src, src2, dst, dst2) b4_load_2row(2, src, src2, dst, dst2)
- #define b4_load_2row_3col(src, src2, dst, dst2) b4_load_2row(3, src, src2, dst, dst2)
- #define b4_load_2row_4col(src, src2, dst, dst2) b4_load_2row(4, src, src2, dst, dst2)
- #define b4_load_3row_1col(src, src2, src3, dst, dst2, dst3) b4_load_3row(1, src, src2, src3, dst, dst2, dst3)
- #define b4_load_3row_2col(src, src2, src3, dst, dst2, dst3) b4_load_3row(2, src, src2, src3, dst, dst2, dst3)
- #define b4_load_3row_3col(src, src2, src3, dst, dst2, dst3) b4_load_3row(3, src, src2, src3, dst, dst2, dst3)
- #define b4_load_3row_4col(src, src2, src3, dst, dst2, dst3) b4_load_3row(4, src, src2, src3, dst, dst2, dst3)
- #define b4_load_4row_1col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
- b4_load_4row(1, src, src2, src3, src4, dst, dst2, dst3, dst4)
- #define b4_load_4row_2col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
- b4_load_4row(2, src, src2, src3, src4, dst, dst2, dst3, dst4)
- #define b4_load_4row_3col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
- b4_load_4row(3, src, src2, src3, src4, dst, dst2, dst3, dst4)
- #define b4_load_4row_4col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
- b4_load_4row(4, src, src2, src3, src4, dst, dst2, dst3, dst4)
- #define b4_load_5row_1col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- b4_load_5row(1, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
- #define b4_load_5row_2col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- b4_load_5row(2, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
- #define b4_load_5row_3col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- b4_load_5row(3, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
- #define b4_load_5row_4col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- b4_load_5row(4, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
- #define b2_load_1row_1col(src, dst) b2_load_1row(1, src, dst)
- #define b2_load_1row_2col(src, dst) b2_load_1row(2, src, dst)
- #define b2_load_1row_3col(src, dst) b2_load_1row(3, src, dst)
- #define b2_load_1row_4col(src, dst) b2_load_1row(4, src, dst)
- #define b2_load_2row_1col(src, src2, dst, dst2) b2_load_2row(1, src, src2, dst, dst2)
- #define b2_load_2row_2col(src, src2, dst, dst2) b2_load_2row(2, src, src2, dst, dst2)
- #define b2_load_2row_3col(src, src2, dst, dst2) b2_load_2row(3, src, src2, dst, dst2)
- #define b2_load_2row_4col(src, src2, dst, dst2) b2_load_2row(4, src, src2, dst, dst2)
- #define b2_load_3row_1col(src, src2, src3, dst, dst2, dst3) b2_load_3row(1, src, src2, src3, dst, dst2, dst3)
- #define b2_load_3row_2col(src, src2, src3, dst, dst2, dst3) b2_load_3row(2, src, src2, src3, dst, dst2, dst3)
- #define b2_load_3row_3col(src, src2, src3, dst, dst2, dst3) b2_load_3row(3, src, src2, src3, dst, dst2, dst3)
- #define b2_load_3row_4col(src, src2, src3, dst, dst2, dst3) b2_load_3row(4, src, src2, src3, dst, dst2, dst3)
- #define b2_load_4row_1col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
- b2_load_4row(1, src, src2, src3, src4, dst, dst2, dst3, dst4)
- #define b2_load_4row_2col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
- b2_load_4row(2, src, src2, src3, src4, dst, dst2, dst3, dst4)
- #define b2_load_4row_3col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
- b2_load_4row(3, src, src2, src3, src4, dst, dst2, dst3, dst4)
- #define b2_load_4row_4col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
- b2_load_4row(4, src, src2, src3, src4, dst, dst2, dst3, dst4)
- #define b2_load_5row_1col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- b2_load_5row(1, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
- #define b2_load_5row_2col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- b2_load_5row(2, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
- #define b2_load_5row_3col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- b2_load_5row(3, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
- #define b2_load_5row_4col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- b2_load_5row(4, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
- #define load_1row_1col(src, dst) basic_load_1row(1, src, dst)
- #define load_1row_2col(src, dst) basic_load_1row(2, src, dst)
- #define load_1row_3col(src, dst) basic_load_1row(3, src, dst)
- #define load_1row_4col(src, dst) basic_load_1row(4, src, dst)
- #define load_2row_1col(src, src2, dst, dst2) basic_load_2row(1, src, src2, dst, dst2)
- #define load_2row_2col(src, src2, dst, dst2) basic_load_2row(2, src, src2, dst, dst2)
- #define load_2row_3col(src, src2, dst, dst2) basic_load_2row(3, src, src2, dst, dst2)
- #define load_2row_4col(src, src2, dst, dst2) basic_load_2row(4, src, src2, dst, dst2)
- #define load_3row_1col(src, src2, src3, dst, dst2, dst3) basic_load_3row(1, src, src2, src3, dst, dst2, dst3)
- #define load_3row_2col(src, src2, src3, dst, dst2, dst3) basic_load_3row(2, src, src2, src3, dst, dst2, dst3)
- #define load_3row_3col(src, src2, src3, dst, dst2, dst3) basic_load_3row(3, src, src2, src3, dst, dst2, dst3)
- #define load_3row_4col(src, src2, src3, dst, dst2, dst3) basic_load_3row(4, src, src2, src3, dst, dst2, dst3)
- #define load_4row_1col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
- basic_load_4row(1, src, src2, src3, src4, dst, dst2, dst3, dst4)
- #define load_4row_2col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
- basic_load_4row(2, src, src2, src3, src4, dst, dst2, dst3, dst4)
- #define load_4row_3col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
- basic_load_4row(3, src, src2, src3, src4, dst, dst2, dst3, dst4)
- #define load_4row_4col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
- basic_load_4row(4, src, src2, src3, src4, dst, dst2, dst3, dst4)
- #define load_5row_1col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- basic_load_5row(1, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
- #define load_5row_2col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- basic_load_5row(2, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
- #define load_5row_3col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- basic_load_5row(3, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
- #define load_5row_4col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
- basic_load_5row(4, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
/* ---------------------------------- Reuse ---------------------------------- */
- #define basic_reuse_1row(col, src_31, dst_31) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q31_assign2(src_31, dst_31) block_cnt--; \
- }
- #define basic_reuse_2row(col, src_31, src2_31, dst_31, dst2_31) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q31_assign2(src_31, dst_31) q31_assign2(src2_31, dst2_31) block_cnt--; \
- }
- #define basic_reuse_3row(col, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q31_assign2(src_31, dst_31) q31_assign2(src2_31, dst2_31) q31_assign2(src3_31, dst3_31) block_cnt--; \
- }
- #define basic_reuse_4row(col, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q31_assign2(src_31, dst_31) q31_assign2(src2_31, dst2_31) q31_assign2(src3_31, dst3_31) \
- q31_assign2(src4_31, dst4_31) block_cnt--; \
- }
- #define basic_reuse_5row(col, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \
- block_cnt = channel_div4 * col; \
- while (block_cnt > 0) { \
- q31_assign2(src_31, dst_31) q31_assign2(src2_31, dst2_31) q31_assign2(src3_31, dst3_31) \
- q31_assign2(src4_31, dst4_31) q31_assign2(src5_31, dst5_31) block_cnt--; \
- }
- #define reuse_1row_1col(src_31, dst_31) basic_reuse_1row(1, src_31, dst_31)
- #define reuse_1row_2col(src_31, dst_31) basic_reuse_1row(2, src_31, dst_31)
- #define reuse_1row_3col(src_31, dst_31) basic_reuse_1row(3, src_31, dst_31)
- #define reuse_1row_4col(src_31, dst_31) basic_reuse_1row(4, src_31, dst_31)
- #define reuse_1row_5col(src_31, dst_31) basic_reuse_1row(5, src_31, dst_31)
- #define reuse_1row_6col(src_31, dst_31) basic_reuse_1row(6, src_31, dst_31)
- #define reuse_2row_1col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(1, src_31, src2_31, dst_31, dst2_31)
- #define reuse_2row_2col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(2, src_31, src2_31, dst_31, dst2_31)
- #define reuse_2row_3col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(3, src_31, src2_31, dst_31, dst2_31)
- #define reuse_2row_4col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(4, src_31, src2_31, dst_31, dst2_31)
- #define reuse_2row_5col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(5, src_31, src2_31, dst_31, dst2_31)
- #define reuse_2row_6col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(6, src_31, src2_31, dst_31, dst2_31)
- #define reuse_3row_1col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
- basic_reuse_3row(1, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31)
- #define reuse_3row_2col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
- basic_reuse_3row(2, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31)
- #define reuse_3row_3col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
- basic_reuse_3row(3, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31)
- #define reuse_3row_4col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
- basic_reuse_3row(4, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31)
- #define reuse_3row_5col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
- basic_reuse_3row(5, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31)
- #define reuse_3row_6col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
- basic_reuse_3row(6, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31)
- #define reuse_4row_3col(src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \
- basic_reuse_4row(3, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31)
- #define reuse_4row_4col(src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \
- basic_reuse_4row(4, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31)
- #define reuse_4row_5col(src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \
- basic_reuse_4row(5, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31)
- #define reuse_4row_6col(src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \
- basic_reuse_4row(6, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31)
- #define reuse_5row_3col(src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \
- basic_reuse_5row(3, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31)
- #define reuse_5row_4col(src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \
- basic_reuse_5row(4, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31)
- #define reuse_5row_5col(src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \
- basic_reuse_5row(5, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31)
- #define reuse_5row_6col(src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \
- basic_reuse_5row(6, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31)
#endif /* ARMNN_INCLUDE_IMG2COL_ELEMENT_H_ */