/* img2col_element.h */
/* ----------------------------------------------------------------------
 * Project: TinyEngine
 * Title: img2col_element.h
 *
 * Reference papers:
 * - MCUNet: Tiny Deep Learning on IoT Devices, NeurIPS 2020
 * - MCUNetV2: Memory-Efficient Patch-based Inference for Tiny Deep Learning, NeurIPS 2021
 * - MCUNetV3: On-Device Training Under 256KB Memory, NeurIPS 2022
 * Contact authors:
 * - Wei-Ming Chen, wmchen@mit.edu
 * - Wei-Chen Wang, wweichen@mit.edu
 * - Ji Lin, jilin@mit.edu
 * - Ligeng Zhu, ligeng@mit.edu
 * - Song Han, songhan@mit.edu
 *
 * Target ISA: ARMv7E-M
 * -------------------------------------------------------------------- */
/*
 * Copyright (c) 2023 HPMicro
 *
 * SPDX-License-Identifier: BSD-3-Clause
 * Target ISA: RISCV D45
 *
 */
  25. #ifndef ARMNN_INCLUDE_IMG2COL_ELEMENT_H_
  26. #define ARMNN_INCLUDE_IMG2COL_ELEMENT_H_
  27. #include "hpm_math.h"
  28. #define b2_q7_q15_offset_ele(src, dst) \
  29. /* convert from q7 to q15 and then store the results in the destination buffer */ \
  30. /*in_q7x4 = b2_nn_read_q7x4_ia((const q7_t **)&src); \
  31. in_q15x2_1 = __SXTB16_ROR(in_q7x4, 8); \
  32. in_q15x2_2 = __SXTB16(in_q7x4); */ \
  33. in_q15x2_1 = ((src[0] & 0x0C) >> 2) + ((src[0] & 0xC0) << 10); \
  34. in_q15x2_2 = (src[0] & 0x03) + ((src[0] & 0x30) << 12); \
  35. src += 1; \
  36. out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \
  37. /* Maximum of 9 bits from the addition is expected */ \
  38. out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
  39. \
  40. out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \
  41. out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
  42. \
  43. write_q15x2_ia(&dst, out_q15x2_1); \
  44. write_q15x2_ia(&dst, out_q15x2_2);
  45. #define b4_q7_q15_offset_ele(src, dst) \
  46. /* convert from q7 to q15 and then store the results in the destination buffer */ \
  47. /*in_q7x4 = b4_nn_read_q7x4_ia((const q7_t **)&src); \
  48. in_q15x2_1 = __SXTB16_ROR(in_q7x4, 8); \
  49. in_q15x2_2 = __SXTB16(in_q7x4); */ \
  50. in_q15x2_1 = ((src[0] & 0xF0) >> 4) + ((src[1] & 0xF0) << 12); \
  51. in_q15x2_2 = (src[0] & 0x0F) + ((src[1] & 0x0F) << 16); \
  52. src += 2; \
  53. out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \
  54. /* Maximum of 9 bits from the addition is expected */ \
  55. out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
  56. \
  57. out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \
  58. out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
  59. \
  60. write_q15x2_ia(&dst, out_q15x2_1); \
  61. write_q15x2_ia(&dst, out_q15x2_2);
  62. #define q7_q15_offset_ele(src, dst) \
  63. /* convert from q7 to q15 and then store the results in the destination buffer */ \
  64. in_q7x4 = hpm_nn_read_q7x4_ia((const q7_t **)&src); \
  65. /* Extract and sign extend each of the four q7 values to q15 */ \
  66. in_q15x2_1 = __SXTB16_ROR(in_q7x4, 8); \
  67. in_q15x2_2 = __SXTB16(in_q7x4); \
  68. \
  69. out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \
  70. /* Maximum of 9 bits from the addition is expected */ \
  71. out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
  72. \
  73. out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \
  74. out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
  75. \
  76. write_q15x2_ia(&dst, out_q15x2_1); \
  77. write_q15x2_ia(&dst, out_q15x2_2);
  78. #define q8_q15_offset_ele(src, dst) \
  79. /* convert from q8 to q15 and then store the results in the destination buffer */ \
  80. in_q7x4 = hpm_nn_read_q7x4_ia((const q8_t **)&src); \
  81. /* Extend each of the four q8 values to q15 */ \
  82. in_q15x2_1 = __UXTB16(__ROR(in_q7x4, 8)); \
  83. in_q15x2_2 = __UXTB16(in_q7x4); \
  84. \
  85. out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16); \
  86. /* Maximum of 9 bits from the addition is expected */ \
  87. out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
  88. \
  89. out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16); \
  90. out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
  91. \
  92. write_q15x2_ia(&dst, out_q15x2_1); \
  93. write_q15x2_ia(&dst, out_q15x2_2);
  94. #define b4_q15_offset_reordered_ele(src, dst) \
  95. /* convert from q7 to q15 and then store the results in the destination buffer */ \
  96. in_q7x4 = b4_nn_read_q7x4_ia((const q7_t **)&src); \
  97. \
  98. /* Extract and sign extend each of the four q7 values to q15 */ \
  99. out_q15x2_1 = __SXTB16_ROR(in_q7x4, 8); \
  100. out_q15x2_2 = __SXTB16(in_q7x4); \
  101. \
  102. out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
  103. out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
  104. \
  105. write_q15x2_ia(&dst, out_q15x2_2); \
  106. write_q15x2_ia(&dst, out_q15x2_1);
  107. #define b2_q15_offset_reordered_ele(src, dst) \
  108. /* convert from q7 to q15 and then store the results in the destination buffer */ \
  109. in_q7x4 = b2_nn_read_q7x4_ia(&src); \
  110. \
  111. /* Extract and sign extend each of the four q7 values to q15 */ \
  112. out_q15x2_1 = __SXTB16_ROR(in_q7x4, 8); \
  113. out_q15x2_2 = __SXTB16(in_q7x4); \
  114. \
  115. out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
  116. out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
  117. \
  118. write_q15x2_ia(&dst, out_q15x2_2); \
  119. write_q15x2_ia(&dst, out_q15x2_1);
  120. #define q7_q15_offset_reordered_ele(src, dst) \
  121. /* convert from q7 to q15 and then store the results in the destination buffer */ \
  122. in_q7x4 = hpm_nn_read_q7x4_ia((const q7_t **)&src); \
  123. \
  124. /* Extract and sign extend each of the four q7 values to q15 */ \
  125. out_q15x2_1 = __SXTB16_ROR(in_q7x4, 8); \
  126. out_q15x2_2 = __SXTB16(in_q7x4); \
  127. \
  128. out_q15x2_1 = __SADD16(out_q15x2_1, offset_q15x2); \
  129. out_q15x2_2 = __SADD16(out_q15x2_2, offset_q15x2); \
  130. \
  131. write_q15x2_ia(&dst, out_q15x2_2); \
  132. write_q15x2_ia(&dst, out_q15x2_1);
  133. #define q31_assign2(src, dst) \
  134. *dst++ = *src++; \
  135. *dst++ = *src++;
  136. #define q31_assign4(src, dst) q31_assign2(src, dst) q31_assign2(src, dst)
  137. #define q31_assign6(src, dst) q31_assign4(src, dst) q31_assign2(src, dst)
  138. #define q31_assign8(src, dst) q31_assign4(src, dst) q31_assign4(src, dst)
  139. #define q31_assign10(src, dst) q31_assign8(src, dst) q31_assign2(src, dst)
  140. #define q31_assign12(src, dst) q31_assign10(src, dst) q31_assign2(src, dst)
  141. #define q31_pad2(dst, padvalue) \
  142. *dst++ = padvalue; \
  143. *dst++ = padvalue;
  144. #define q31_pad4(dst, padvalue) q31_pad2(dst, padvalue) q31_pad2(dst, padvalue)
  145. #define q31_pad6(dst, padvalue) q31_pad4(dst, padvalue) q31_pad2(dst, padvalue)
  146. #define q31_pad10(dst, padvalue) q31_pad6(dst, padvalue) q31_pad4(dst, padvalue)
  147. #define q31_pad14(dst, padvalue) q31_pad6(dst, padvalue) q31_pad6(dst, padvalue) q31_pad2(dst, padvalue)
  148. #define assignq31toq15() \
  149. dst = (q15_t *)dst_31; \
  150. dst2 = (q15_t *)dst2_31; \
  151. dst3 = (q15_t *)dst3_31; \
  152. dst4 = (q15_t *)dst4_31; \
  153. dst5 = (q15_t *)dst5_31; \
  154. dst6 = (q15_t *)dst6_31; \
  155. dst7 = (q15_t *)dst7_31;
  156. #define assignq15toq31() \
  157. dst_31 = (q31_t *)dst; \
  158. dst2_31 = (q31_t *)dst2; \
  159. dst3_31 = (q31_t *)dst3; \
  160. dst4_31 = (q31_t *)dst4; \
  161. dst5_31 = (q31_t *)dst5; \
  162. dst6_31 = (q31_t *)dst6; \
  163. dst7_31 = (q31_t *)dst7;
  164. /* ---------------------------------- Pad ---------------------------------- */
  165. #define basic_pad_1row(col, dst_31, pad_out_q15x2) \
  166. block_cnt = channel_div4 * col; \
  167. while (block_cnt > 0) { \
  168. q31_pad2(dst_31, pad_out_q15x2) block_cnt--; \
  169. }
  170. #define basic_pad_2row(col, dst_31, dst2_31, pad_out_q15x2) \
  171. block_cnt = channel_div4 * col; \
  172. while (block_cnt > 0) { \
  173. q31_pad2(dst_31, pad_out_q15x2) q31_pad2(dst2_31, pad_out_q15x2) block_cnt--; \
  174. }
  175. #define basic_pad_3row(col, dst_31, dst2_31, dst3_31, pad_out_q15x2) \
  176. block_cnt = channel_div4 * col; \
  177. while (block_cnt > 0) { \
  178. q31_pad2(dst_31, pad_out_q15x2) q31_pad2(dst2_31, pad_out_q15x2) q31_pad2(dst3_31, pad_out_q15x2) block_cnt--; \
  179. }
  180. #define basic_pad_4row(col, dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) \
  181. block_cnt = channel_div4 * col; \
  182. while (block_cnt > 0) { \
  183. q31_pad2(dst_31, pad_out_q15x2) q31_pad2(dst2_31, pad_out_q15x2) q31_pad2(dst3_31, pad_out_q15x2) \
  184. q31_pad2(dst4_31, pad_out_q15x2) block_cnt--; \
  185. }
  186. #define basic_pad_5row(col, dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) \
  187. block_cnt = channel_div4 * col; \
  188. while (block_cnt > 0) { \
  189. q31_pad2(dst_31, pad_out_q15x2) q31_pad2(dst2_31, pad_out_q15x2) q31_pad2(dst3_31, pad_out_q15x2) \
  190. q31_pad2(dst4_31, pad_out_q15x2) q31_pad2(dst5_31, pad_out_q15x2) block_cnt--; \
  191. }
  192. #define pad_1row_1col(dst_31, pad_out_q15x2) basic_pad_1row(1, dst_31, pad_out_q15x2)
  193. #define pad_1row_2col(dst_31, pad_out_q15x2) basic_pad_1row(2, dst_31, pad_out_q15x2)
  194. #define pad_1row_3col(dst_31, pad_out_q15x2) basic_pad_1row(3, dst_31, pad_out_q15x2)
  195. #define pad_2row_1col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(1, dst_31, dst2_31, pad_out_q15x2)
  196. #define pad_2row_2col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(2, dst_31, dst2_31, pad_out_q15x2)
  197. #define pad_2row_3col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(3, dst_31, dst2_31, pad_out_q15x2)
  198. #define pad_2row_4col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(4, dst_31, dst2_31, pad_out_q15x2)
  199. #define pad_2row_5col(dst_31, dst2_31, pad_out_q15x2) basic_pad_2row(5, dst_31, dst2_31, pad_out_q15x2)
  200. #define pad_3row_1col(dst_31, dst2_31, dst3_31, pad_out_q15x2) \
  201. basic_pad_3row(1, dst_31, dst2_31, dst3_31, pad_out_q15x2)
  202. #define pad_3row_2col(dst_31, dst2_31, dst3_31, pad_out_q15x2) \
  203. basic_pad_3row(2, dst_31, dst2_31, dst3_31, pad_out_q15x2)
  204. #define pad_3row_3col(dst_31, dst2_31, dst3_31, pad_out_q15x2) \
  205. basic_pad_3row(3, dst_31, dst2_31, dst3_31, pad_out_q15x2)
  206. #define pad_4row_1col(dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) \
  207. basic_pad_4row(1, dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2)
  208. #define pad_4row_2col(dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) \
  209. basic_pad_4row(2, dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2)
  210. #define pad_4row_3col(dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2) \
  211. basic_pad_4row(3, dst_31, dst2_31, dst3_31, dst4_31, pad_out_q15x2)
  212. #define pad_5row_1col(dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) \
  213. basic_pad_5row(1, dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2)
  214. #define pad_5row_2col(dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) \
  215. basic_pad_5row(2, dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2)
  216. #define pad_5row_3col(dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2) \
  217. basic_pad_5row(3, dst_31, dst2_31, dst3_31, dst4_31, dst5_31, pad_out_q15x2)
  218. /* ---------------------------------- Load ---------------------------------- */
  219. #define basic_load_1row(col, src, dst) \
  220. block_cnt = channel_div4 * col; \
  221. while (block_cnt > 0) { \
  222. q7_q15_offset_ele(src, dst) block_cnt--; \
  223. }
  224. #define basic_load_2row(col, src, src2, dst, dst2) \
  225. block_cnt = channel_div4 * col; \
  226. while (block_cnt > 0) { \
  227. q7_q15_offset_ele(src, dst) q7_q15_offset_ele(src2, dst2) block_cnt--; \
  228. }
  229. #define basic_load_3row(col, src, src2, src3, dst, dst2, dst3) \
  230. block_cnt = channel_div4 * col; \
  231. while (block_cnt > 0) { \
  232. q7_q15_offset_ele(src, dst) q7_q15_offset_ele(src2, dst2) q7_q15_offset_ele(src3, dst3) block_cnt--; \
  233. }
  234. #define basic_load_4row(col, src, src2, src3, src4, dst, dst2, dst3, dst4) \
  235. block_cnt = channel_div4 * col; \
  236. while (block_cnt > 0) { \
  237. q7_q15_offset_ele(src, dst) q7_q15_offset_ele(src2, dst2) q7_q15_offset_ele(src3, dst3) \
  238. q7_q15_offset_ele(src4, dst4) block_cnt--; \
  239. }
  240. #define basic_load_5row(col, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  241. block_cnt = channel_div4 * col; \
  242. while (block_cnt > 0) { \
  243. q7_q15_offset_ele(src, dst) q7_q15_offset_ele(src2, dst2) q7_q15_offset_ele(src3, dst3) \
  244. q7_q15_offset_ele(src4, dst4) q7_q15_offset_ele(src5, dst5) block_cnt--; \
  245. }
  246. ///////////////////////// 4bit //////////////////////////
  247. #define b4_load_1row(col, src, dst) \
  248. block_cnt = channel_div4 * col; \
  249. while (block_cnt > 0) { \
  250. b4_q7_q15_offset_ele(src, dst) block_cnt--; \
  251. }
  252. #define b4_load_2row(col, src, src2, dst, dst2) \
  253. block_cnt = channel_div4 * col; \
  254. while (block_cnt > 0) { \
  255. b4_q7_q15_offset_ele(src, dst) b4_q7_q15_offset_ele(src2, dst2) block_cnt--; \
  256. }
  257. #define b4_load_3row(col, src, src2, src3, dst, dst2, dst3) \
  258. block_cnt = channel_div4 * col; \
  259. while (block_cnt > 0) { \
  260. b4_q7_q15_offset_ele(src, dst) b4_q7_q15_offset_ele(src2, dst2) b4_q7_q15_offset_ele(src3, dst3) block_cnt--; \
  261. }
  262. #define b4_load_4row(col, src, src2, src3, src4, dst, dst2, dst3, dst4) \
  263. block_cnt = channel_div4 * col; \
  264. while (block_cnt > 0) { \
  265. b4_q7_q15_offset_ele(src, dst) b4_q7_q15_offset_ele(src2, dst2) b4_q7_q15_offset_ele(src3, dst3) \
  266. b4_q7_q15_offset_ele(src4, dst4) block_cnt--; \
  267. }
  268. #define b4_load_5row(col, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  269. block_cnt = channel_div4 * col; \
  270. while (block_cnt > 0) { \
  271. b4_q7_q15_offset_ele(src, dst) b4_q7_q15_offset_ele(src2, dst2) b4_q7_q15_offset_ele(src3, dst3) \
  272. b4_q7_q15_offset_ele(src4, dst4) b4_q7_q15_offset_ele(src5, dst5) block_cnt--; \
  273. }
  274. ///////////////////////// 2bit //////////////////////////
  275. #define b2_load_1row(col, src, dst) \
  276. block_cnt = channel_div4 * col; \
  277. while (block_cnt > 0) { \
  278. b2_q7_q15_offset_ele(src, dst) block_cnt--; \
  279. }
  280. #define b2_load_2row(col, src, src2, dst, dst2) \
  281. block_cnt = channel_div4 * col; \
  282. while (block_cnt > 0) { \
  283. b2_q7_q15_offset_ele(src, dst) b2_q7_q15_offset_ele(src2, dst2) block_cnt--; \
  284. }
  285. #define b2_load_3row(col, src, src2, src3, dst, dst2, dst3) \
  286. block_cnt = channel_div4 * col; \
  287. while (block_cnt > 0) { \
  288. b2_q7_q15_offset_ele(src, dst) b2_q7_q15_offset_ele(src2, dst2) b2_q7_q15_offset_ele(src3, dst3) block_cnt--; \
  289. }
  290. #define b2_load_4row(col, src, src2, src3, src4, dst, dst2, dst3, dst4) \
  291. block_cnt = channel_div4 * col; \
  292. while (block_cnt > 0) { \
  293. b2_q7_q15_offset_ele(src, dst) b2_q7_q15_offset_ele(src2, dst2) b2_q7_q15_offset_ele(src3, dst3) \
  294. b2_q7_q15_offset_ele(src4, dst4) block_cnt--; \
  295. }
  296. #define b2_load_5row(col, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  297. block_cnt = channel_div4 * col; \
  298. while (block_cnt > 0) { \
  299. b2_q7_q15_offset_ele(src, dst) b2_q7_q15_offset_ele(src2, dst2) b2_q7_q15_offset_ele(src3, dst3) \
  300. b2_q7_q15_offset_ele(src4, dst4) b2_q7_q15_offset_ele(src5, dst5) block_cnt--; \
  301. }
  302. #define b4_load_1row_1col(src, dst) b4_load_1row(1, src, dst)
  303. #define b4_load_1row_2col(src, dst) b4_load_1row(2, src, dst)
  304. #define b4_load_1row_3col(src, dst) b4_load_1row(3, src, dst)
  305. #define b4_load_1row_4col(src, dst) b4_load_1row(4, src, dst)
  306. #define b4_load_2row_1col(src, src2, dst, dst2) b4_load_2row(1, src, src2, dst, dst2)
  307. #define b4_load_2row_2col(src, src2, dst, dst2) b4_load_2row(2, src, src2, dst, dst2)
  308. #define b4_load_2row_3col(src, src2, dst, dst2) b4_load_2row(3, src, src2, dst, dst2)
  309. #define b4_load_2row_4col(src, src2, dst, dst2) b4_load_2row(4, src, src2, dst, dst2)
  310. #define b4_load_3row_1col(src, src2, src3, dst, dst2, dst3) b4_load_3row(1, src, src2, src3, dst, dst2, dst3)
  311. #define b4_load_3row_2col(src, src2, src3, dst, dst2, dst3) b4_load_3row(2, src, src2, src3, dst, dst2, dst3)
  312. #define b4_load_3row_3col(src, src2, src3, dst, dst2, dst3) b4_load_3row(3, src, src2, src3, dst, dst2, dst3)
  313. #define b4_load_3row_4col(src, src2, src3, dst, dst2, dst3) b4_load_3row(4, src, src2, src3, dst, dst2, dst3)
  314. #define b4_load_4row_1col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
  315. b4_load_4row(1, src, src2, src3, src4, dst, dst2, dst3, dst4)
  316. #define b4_load_4row_2col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
  317. b4_load_4row(2, src, src2, src3, src4, dst, dst2, dst3, dst4)
  318. #define b4_load_4row_3col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
  319. b4_load_4row(3, src, src2, src3, src4, dst, dst2, dst3, dst4)
  320. #define b4_load_4row_4col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
  321. b4_load_4row(4, src, src2, src3, src4, dst, dst2, dst3, dst4)
  322. #define b4_load_5row_1col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  323. b4_load_5row(1, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
  324. #define b4_load_5row_2col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  325. b4_load_5row(2, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
  326. #define b4_load_5row_3col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  327. b4_load_5row(3, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
  328. #define b4_load_5row_4col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  329. b4_load_5row(4, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
  330. #define b2_load_1row_1col(src, dst) b2_load_1row(1, src, dst)
  331. #define b2_load_1row_2col(src, dst) b2_load_1row(2, src, dst)
  332. #define b2_load_1row_3col(src, dst) b2_load_1row(3, src, dst)
  333. #define b2_load_1row_4col(src, dst) b2_load_1row(4, src, dst)
  334. #define b2_load_2row_1col(src, src2, dst, dst2) b2_load_2row(1, src, src2, dst, dst2)
  335. #define b2_load_2row_2col(src, src2, dst, dst2) b2_load_2row(2, src, src2, dst, dst2)
  336. #define b2_load_2row_3col(src, src2, dst, dst2) b2_load_2row(3, src, src2, dst, dst2)
  337. #define b2_load_2row_4col(src, src2, dst, dst2) b2_load_2row(4, src, src2, dst, dst2)
  338. #define b2_load_3row_1col(src, src2, src3, dst, dst2, dst3) b2_load_3row(1, src, src2, src3, dst, dst2, dst3)
  339. #define b2_load_3row_2col(src, src2, src3, dst, dst2, dst3) b2_load_3row(2, src, src2, src3, dst, dst2, dst3)
  340. #define b2_load_3row_3col(src, src2, src3, dst, dst2, dst3) b2_load_3row(3, src, src2, src3, dst, dst2, dst3)
  341. #define b2_load_3row_4col(src, src2, src3, dst, dst2, dst3) b2_load_3row(4, src, src2, src3, dst, dst2, dst3)
  342. #define b2_load_4row_1col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
  343. b2_load_4row(1, src, src2, src3, src4, dst, dst2, dst3, dst4)
  344. #define b2_load_4row_2col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
  345. b2_load_4row(2, src, src2, src3, src4, dst, dst2, dst3, dst4)
  346. #define b2_load_4row_3col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
  347. b2_load_4row(3, src, src2, src3, src4, dst, dst2, dst3, dst4)
  348. #define b2_load_4row_4col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
  349. b2_load_4row(4, src, src2, src3, src4, dst, dst2, dst3, dst4)
  350. #define b2_load_5row_1col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  351. b2_load_5row(1, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
  352. #define b2_load_5row_2col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  353. b2_load_5row(2, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
  354. #define b2_load_5row_3col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  355. b2_load_5row(3, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
  356. #define b2_load_5row_4col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  357. b2_load_5row(4, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
  358. #define load_1row_1col(src, dst) basic_load_1row(1, src, dst)
  359. #define load_1row_2col(src, dst) basic_load_1row(2, src, dst)
  360. #define load_1row_3col(src, dst) basic_load_1row(3, src, dst)
  361. #define load_1row_4col(src, dst) basic_load_1row(4, src, dst)
  362. #define load_2row_1col(src, src2, dst, dst2) basic_load_2row(1, src, src2, dst, dst2)
  363. #define load_2row_2col(src, src2, dst, dst2) basic_load_2row(2, src, src2, dst, dst2)
  364. #define load_2row_3col(src, src2, dst, dst2) basic_load_2row(3, src, src2, dst, dst2)
  365. #define load_2row_4col(src, src2, dst, dst2) basic_load_2row(4, src, src2, dst, dst2)
  366. #define load_3row_1col(src, src2, src3, dst, dst2, dst3) basic_load_3row(1, src, src2, src3, dst, dst2, dst3)
  367. #define load_3row_2col(src, src2, src3, dst, dst2, dst3) basic_load_3row(2, src, src2, src3, dst, dst2, dst3)
  368. #define load_3row_3col(src, src2, src3, dst, dst2, dst3) basic_load_3row(3, src, src2, src3, dst, dst2, dst3)
  369. #define load_3row_4col(src, src2, src3, dst, dst2, dst3) basic_load_3row(4, src, src2, src3, dst, dst2, dst3)
  370. #define load_4row_1col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
  371. basic_load_4row(1, src, src2, src3, src4, dst, dst2, dst3, dst4)
  372. #define load_4row_2col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
  373. basic_load_4row(2, src, src2, src3, src4, dst, dst2, dst3, dst4)
  374. #define load_4row_3col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
  375. basic_load_4row(3, src, src2, src3, src4, dst, dst2, dst3, dst4)
  376. #define load_4row_4col(src, src2, src3, src4, dst, dst2, dst3, dst4) \
  377. basic_load_4row(4, src, src2, src3, src4, dst, dst2, dst3, dst4)
  378. #define load_5row_1col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  379. basic_load_5row(1, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
  380. #define load_5row_2col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  381. basic_load_5row(2, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
  382. #define load_5row_3col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  383. basic_load_5row(3, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
  384. #define load_5row_4col(src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5) \
  385. basic_load_5row(4, src, src2, src3, src4, src5, dst, dst2, dst3, dst4, dst5)
  386. /* ---------------------------------- Reuse ---------------------------------- */
  387. #define basic_reuse_1row(col, src_31, dst_31) \
  388. block_cnt = channel_div4 * col; \
  389. while (block_cnt > 0) { \
  390. q31_assign2(src_31, dst_31) block_cnt--; \
  391. }
  392. #define basic_reuse_2row(col, src_31, src2_31, dst_31, dst2_31) \
  393. block_cnt = channel_div4 * col; \
  394. while (block_cnt > 0) { \
  395. q31_assign2(src_31, dst_31) q31_assign2(src2_31, dst2_31) block_cnt--; \
  396. }
  397. #define basic_reuse_3row(col, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
  398. block_cnt = channel_div4 * col; \
  399. while (block_cnt > 0) { \
  400. q31_assign2(src_31, dst_31) q31_assign2(src2_31, dst2_31) q31_assign2(src3_31, dst3_31) block_cnt--; \
  401. }
  402. #define basic_reuse_4row(col, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \
  403. block_cnt = channel_div4 * col; \
  404. while (block_cnt > 0) { \
  405. q31_assign2(src_31, dst_31) q31_assign2(src2_31, dst2_31) q31_assign2(src3_31, dst3_31) \
  406. q31_assign2(src4_31, dst4_31) block_cnt--; \
  407. }
  408. #define basic_reuse_5row(col, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \
  409. block_cnt = channel_div4 * col; \
  410. while (block_cnt > 0) { \
  411. q31_assign2(src_31, dst_31) q31_assign2(src2_31, dst2_31) q31_assign2(src3_31, dst3_31) \
  412. q31_assign2(src4_31, dst4_31) q31_assign2(src5_31, dst5_31) block_cnt--; \
  413. }
  414. #define reuse_1row_1col(src_31, dst_31) basic_reuse_1row(1, src_31, dst_31)
  415. #define reuse_1row_2col(src_31, dst_31) basic_reuse_1row(2, src_31, dst_31)
  416. #define reuse_1row_3col(src_31, dst_31) basic_reuse_1row(3, src_31, dst_31)
  417. #define reuse_1row_4col(src_31, dst_31) basic_reuse_1row(4, src_31, dst_31)
  418. #define reuse_1row_5col(src_31, dst_31) basic_reuse_1row(5, src_31, dst_31)
  419. #define reuse_1row_6col(src_31, dst_31) basic_reuse_1row(6, src_31, dst_31)
  420. #define reuse_2row_1col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(1, src_31, src2_31, dst_31, dst2_31)
  421. #define reuse_2row_2col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(2, src_31, src2_31, dst_31, dst2_31)
  422. #define reuse_2row_3col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(3, src_31, src2_31, dst_31, dst2_31)
  423. #define reuse_2row_4col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(4, src_31, src2_31, dst_31, dst2_31)
  424. #define reuse_2row_5col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(5, src_31, src2_31, dst_31, dst2_31)
  425. #define reuse_2row_6col(src_31, src2_31, dst_31, dst2_31) basic_reuse_2row(6, src_31, src2_31, dst_31, dst2_31)
  426. #define reuse_3row_1col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
  427. basic_reuse_3row(1, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31)
  428. #define reuse_3row_2col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
  429. basic_reuse_3row(2, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31)
  430. #define reuse_3row_3col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
  431. basic_reuse_3row(3, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31)
  432. #define reuse_3row_4col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
  433. basic_reuse_3row(4, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31)
  434. #define reuse_3row_5col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
  435. basic_reuse_3row(5, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31)
  436. #define reuse_3row_6col(src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31) \
  437. basic_reuse_3row(6, src_31, src2_31, src3_31, dst_31, dst2_31, dst3_31)
  438. #define reuse_4row_3col(src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \
  439. basic_reuse_4row(3, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31)
  440. #define reuse_4row_4col(src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \
  441. basic_reuse_4row(4, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31)
  442. #define reuse_4row_5col(src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \
  443. basic_reuse_4row(5, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31)
  444. #define reuse_4row_6col(src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31) \
  445. basic_reuse_4row(6, src_31, src2_31, src3_31, src4_31, dst_31, dst2_31, dst3_31, dst4_31)
  446. #define reuse_5row_3col(src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \
  447. basic_reuse_5row(3, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31)
  448. #define reuse_5row_4col(src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \
  449. basic_reuse_5row(4, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31)
  450. #define reuse_5row_5col(src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \
  451. basic_reuse_5row(5, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31)
  452. #define reuse_5row_6col(src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31) \
  453. basic_reuse_5row(6, src_31, src2_31, src3_31, src4_31, src5_31, dst_31, dst2_31, dst3_31, dst4_31, dst5_31)
  454. #endif /* ARMNN_INCLUDE_IMG2COL_ELEMENT_H_ */