37 #include "../include/harris.hpp"
38 #include "../include/xf_harris_config.h"
40 #ifdef USE_HLSLIB_DATAFLOW
41 #include "../../../../../hlslib/include/hlslib/xilinx/Stream.h"
42 #include "../../../../../hlslib/include/hlslib/xilinx/Simulation.h"
45 #ifdef USE_HLSLIB_STREAM
56 #if TRANSFERS_PER_CHUNK_DIVEND == 0
57 #define TRANSFERS_PER_CHUNK_LAST_BURST TRANSFERS_PER_CHUNK
59 #define TRANSFERS_PER_CHUNK_LAST_BURST TRANSFERS_PER_CHUNK_DIVEND
66 stream<NodeId> &sDstNode_sig,
67 ap_uint<32> *po_rx_ports
71 #pragma HLS inline off
75 #pragma HLS reset variable=port_fsm
82 printf(
"DEBUG in pPortAndDestionation: port_fsm - FSM_WRITE_NEW_DATA\n");
84 if(!sDstNode_sig.full())
86 NodeId dst_rank = (*pi_rank + 1) % *pi_size;
87 printf(
"rank: %d; size: %d; \n", (
int) *pi_rank, (
int) *pi_size);
88 sDstNode_sig.write(dst_rank);
93 printf(
"DEBUG in pPortAndDestionation: port_fsm - FSM_DONE\n");
106 unsigned int *processed_word,
unsigned int *image_loaded)
110 img[*processed_word] = (ap_uint<INPUT_PTR_WIDTH>)
input;
// Debug trace of the received word and of the value just stored in img[].
// Both values are 64 bits wide here (uint64_t), so they are printed with
// 64-bit conversions; `%u` only consumes an unsigned int and mismatches the
// argument list.
111 printf(
"DEBUG in storeWordToArray: input = %llu = 0x%16.16llX \n",
static_cast<unsigned long long>(input),
static_cast<unsigned long long>(input));
112 printf(
"DEBUG in storeWordToArray: img[%u]= %llu = 0x%16.16llX \n", *processed_word,
113 static_cast<unsigned long long>(img[*processed_word]),
static_cast<unsigned long long>(img[*processed_word]));
118 printf(
"DEBUG in storeWordToArray: WARNING - you've reached the max depth of img[%u]. Will put *processed_word = 0.\n", *processed_word);
131 #ifdef USE_HLSLIB_STREAM
132 Stream<Data_t_in, MIN_RX_LOOPS> &img_in_axi_stream,
135 stream<ap_uint<INPUT_PTR_WIDTH>> &img_in_axi_stream,
137 unsigned int *processed_word_rx,
138 unsigned int *processed_bytes_rx,
139 stream<bool> &sImageLoaded
146 unsigned int bytes_with_keep = 0;
148 for (
unsigned int i=0; i<loop_cnt; i++) {
152 if ((word.
tkeep >> i) == 0) {
153 printf(
"WARNING: value with tkeep=0 at i=%u\n", i);
156 v.data = (ap_uint<INPUT_PTR_WIDTH>)(word.
tdata >> i*8);
160 img_in_axi_stream.write(v.data);
161 bytes_with_keep += bytes_per_loop;
172 (*processed_bytes_rx) += bytes_with_keep;
173 if (!sImageLoaded.full()) {
174 sImageLoaded.write(
false);
178 printf(
"DEBUG in storeWordToAxiStream: WARNING - you've reached the max depth of img. Will put *processed_bytes_rx = 0.\n");
179 *processed_bytes_rx = 0;
180 if (!sImageLoaded.full()) {
181 sImageLoaded.write(
true);
203 stream<NetworkWord> &siSHL_This_Data,
204 stream<NetworkMetaStream> &siNrc_meta,
205 stream<NetworkMetaStream> &sRxtoTx_Meta,
207 stream<DmCmd> &soMemWrCmdP0,
208 stream<DmSts> &siMemWrStsP0,
213 unsigned int *processed_bytes_rx,
214 stream<bool> &sImageLoaded
218 #pragma HLS INLINE off
219 #pragma HLS pipeline II=1
224 static ap_uint<MEMDW_512> v = 0;
230 static unsigned int cur_transfers_per_chunk;
231 static unsigned int cnt_wr_stream, cnt_wr_img_loaded;
232 static stream<ap_uint<MEMDW_512>> img_in_axi_stream (
"img_in_axi_stream");
234 #pragma HLS stream variable=img_in_axi_stream depth=img_in_axi_stream_depth
235 static unsigned int ddr_addr_in;
242 static DmSts memWrStsP0;
244 #pragma HLS reset variable=cur_transfers_per_chunk
245 #pragma HLS reset variable=cnt_wr_stream
246 #pragma HLS reset variable=cnt_wr_img_loaded
247 #pragma HLS reset variable=ddr_addr_in
248 #pragma HLS reset variable=patternWriteNum
249 #pragma HLS reset variable=timeoutCnt
250 #pragma HLS reset variable=memP0
251 #pragma HLS reset variable=memWrStsP0
256 printf(
"DEBUG in pRXPathDDR: enqueueFSM - WAIT_FOR_META, *processed_bytes_rx=%u\n",
257 *processed_bytes_rx);
265 if ( !siNrc_meta.empty() && !sRxtoTx_Meta.full() )
267 meta_tmp = siNrc_meta.read();
269 sRxtoTx_Meta.write(meta_tmp);
270 if ((*processed_bytes_rx) == 0) {
276 cur_transfers_per_chunk = 0;
294 printf(
"DEBUG in pRXPathDDR: enqueueFSM - PROCESSING_PACKET, *processed_bytes_rx=%u\n",
295 *processed_bytes_rx);
296 if ( !siSHL_This_Data.empty() )
299 netWord = siSHL_This_Data.read();
300 printf(
"DEBUG in pRXPathDDR: Data write = {D=0x%16.16llX, K=0x%2.2X, L=%d} \n",
301 netWord.
tdata.to_long(), netWord.
tkeep.to_int(), netWord.
tlast.to_int());
303 if ((netWord.
tkeep >> cnt_wr_stream) == 0) {
304 printf(
"WARNING: value with tkeep=0 at cnt_wr_stream=%u\n", cnt_wr_stream);
307 v(cnt_wr_stream*64, (cnt_wr_stream+1)*64-1) = netWord.
tdata(0,63);
308 if ((cnt_wr_stream++ == loop_cnt-1) || (netWord.
tlast == 1)) {
309 if ( !img_in_axi_stream.full() ) {
311 img_in_axi_stream.write(v);
351 printf(
"DEBUG in pRXPathDDR: enqueueFSM - FSM_CHK_PROC_BYTES, processed_bytes_rx=%u\n", *processed_bytes_rx);
352 if (*processed_bytes_rx <
IMGSIZE-bytes_per_loop) {
353 (*processed_bytes_rx) += bytes_per_loop;
356 printf(
"DEBUG in pRXPathDDR: WARNING - you've reached the max depth of img. Will put *processed_bytes_rx = 0.\n");
357 *processed_bytes_rx = 0;
384 printf(
"DEBUG in pRXPathDDR: enqueueFSM - FSM_WR_PAT_CMD\n");
385 if ( !soMemWrCmdP0.full() ) {
387 if (*processed_bytes_rx == 0){
402 printf(
"DEBUG in pRXPathDDR: enqueueFSM - FSM_WR_PAT_LOAD\n");
409 if(netWord.
tlast == 1) {
419 printf(
"DEBUG in pRXPathDDR: enqueueFSM - FSM_WR_PAT_DATA\n");
420 if (!soMemWriteP0.full()) {
422 if (!img_in_axi_stream.empty()) {
423 memP0.
tdata = img_in_axi_stream.read();
425 ap_uint<8> keepVal = 0xFF;
426 memP0.
tkeep = (ap_uint<64>) (keepVal, keepVal, keepVal, keepVal, keepVal, keepVal, keepVal, keepVal);
428 printf(
"DEBUG: (patternWriteNum == cur_transfers_per_chunk -1) \n");
430 cnt_wr_img_loaded = 0;
438 soMemWriteP0.write(memP0);
443 printf(
"DEBUG in pRXPathDDR: enqueueFSM - FSM_WR_PAT_STS_A\n");
444 if (!siMemWrStsP0.empty()) {
447 siMemWrStsP0.read(memWrStsP0);
463 printf(
"DEBUG in pRXPathDDR: enqueueFSM - FSM_WR_PAT_STS_B\n");
// The write-status check must compare the tag with `==`. The previous
// `tag = 0x0` assigned zero to the tag, which made the whole condition
// always false and overwrote the status word that had just been read.
464 if ((memWrStsP0.tag == 0x0) && (memWrStsP0.okay == 1)) {
465 if ((*processed_bytes_rx) == 0) {
466 if (!sImageLoaded.full()) {
467 if (cnt_wr_img_loaded++ >= 1) {
468 sImageLoaded.write(
false);
472 sImageLoaded.write(
true);
486 printf(
"DEBUG in pRXPathDDR: enqueueFSM - FSM_WR_PAT_STS_C\n");
487 if(netWord.
tlast == 1) {
514 stream<NetworkWord> &siSHL_This_Data,
515 stream<NetworkMetaStream> &siNrc_meta,
516 stream<NetworkMetaStream> &sRxtoTx_Meta,
517 #ifdef USE_HLSLIB_STREAM
518 Stream<Data_t_in, MIN_RX_LOOPS> &img_in_axi_stream,
521 stream<ap_uint<INPUT_PTR_WIDTH>> &img_in_axi_stream,
524 unsigned int *processed_word_rx,
525 unsigned int *processed_bytes_rx,
526 stream<bool> &sImageLoaded
530 #pragma HLS INLINE off
531 #pragma HLS pipeline II=1
539 printf(
"DEBUG in pRXPath: enqueueFSM - WAIT_FOR_META, *processed_word_rx=%u, *processed_bytes_rx=%u\n",
540 *processed_word_rx, *processed_bytes_rx);
541 if ( !siNrc_meta.empty() && !sRxtoTx_Meta.full() )
543 meta_tmp = siNrc_meta.read();
545 sRxtoTx_Meta.write(meta_tmp);
551 printf(
"DEBUG in pRXPath: enqueueFSM - PROCESSING_PACKET, *processed_word_rx=%u, *processed_bytes_rx=%u\n",
552 *processed_word_rx, *processed_bytes_rx);
553 if ( !siSHL_This_Data.empty() && !img_in_axi_stream.full())
556 netWord = siSHL_This_Data.read();
559 if(netWord.
tlast == 1)
581 stream<NetworkWord> &sRxpToTxp_Data,
587 #ifdef USE_HLSLIB_STREAM
588 Stream<Data_t_in, MIN_RX_LOOPS> &img_in_axi_stream,
589 Stream<Data_t_out, MIN_TX_LOOPS> &img_out_axi_stream,
591 stream<ap_uint<INPUT_PTR_WIDTH>> &img_in_axi_stream,
592 stream<ap_uint<OUTPUT_PTR_WIDTH>> &img_out_axi_stream,
597 stream<bool> &sImageLoaded
601 #pragma HLS INLINE off
602 #pragma HLS pipeline II=1
606 uint16_t Thresh = 442;
608 uint16_t k = K * (1 << 16);
609 static bool accel_called;
610 static unsigned int processed_word_proc;
611 static unsigned int timeoutCntAbs;
612 static unsigned int cnt_i;
614 ap_uint<OUTPUT_PTR_WIDTH> raw64;
619 static unsigned int ddr_addr_out;
620 #pragma HLS reset variable=ddr_addr_out
623 #pragma HLS reset variable=accel_called
624 #pragma HLS reset variable=processed_word_proc
625 #pragma HLS reset variable=timeoutCntAbs
626 #pragma HLS reset variable=cnt_i
627 #pragma HLS reset variable=tmp
628 #pragma HLS reset variable=raw64
629 #pragma HLS reset variable=temp
634 printf(
"DEBUG in pProcPath: WAIT_FOR_META\n");
635 if (!sImageLoaded.empty())
637 if (sImageLoaded.read() ==
true) {
639 accel_called =
false;
640 processed_word_proc = 0;
651 printf(
"DEBUG in pProcPath: PROCESSING_PACKET\n");
653 if ( !img_in_axi_stream.empty() && !img_out_axi_stream.full() )
656 if (accel_called ==
false) {
676 printf(
"DEBUG in pProcPath: HARRIS_RETURN_RESULTS, ddr_addr_out=%u\n", ddr_addr_out);
677 if (accel_called ==
true) {
679 printf(
"DEBUG in pProcPath: Accumulated %u net words (%u B) to complete a single DDR word\n",
689 printf(
"DEBUG in pProcPath: HARRIS_RETURN_RESULTS_ABSORB_DDR_LAT [%u out of %u]\n", timeoutCntAbs,
DDR_LATENCY);
713 printf(
"DEBUG in pProcPath: HARRIS_RETURN_RESULTS_FWD\n");
728 newWord =
NetworkWord(temp.data, temp.keep, temp.last);
729 sRxpToTxp_Data.write(newWord);
740 printf(
"DEBUG in pProcPath: HARRIS_RETURN_RESULTS\n");
741 if ( !img_out_axi_stream.empty() && !sRxpToTxp_Data.full() )
744 temp.data = img_out_axi_stream.read();
745 if ( img_out_axi_stream.empty() )
750 accel_called =
false;
758 newWord =
NetworkWord(temp.data, temp.keep, temp.last);
759 sRxpToTxp_Data.write(newWord);
783 stream<NodeId> &sDstNode_sig,
784 stream<NetworkWord> &soTHIS_Shl_Data,
785 stream<NetworkMetaStream> &soNrc_meta,
786 stream<NetworkWord> &sRxpToTxp_Data,
787 stream<NetworkMetaStream> &sRxtoTx_Meta,
788 unsigned int *processed_word_tx,
789 ap_uint<32> *pi_rank,
794 #pragma HLS INLINE off
795 #pragma HLS pipeline II=1
805 #pragma HLS reset variable=dst_rank
806 #pragma HLS reset variable=netWordTx
812 if(!sDstNode_sig.empty())
814 dst_rank = sDstNode_sig.read();
821 printf(
"DEBUG in pTXPath: dequeueFSM=%d - WAIT_FOR_STREAM_PAIR, *processed_word_tx=%u\n",
825 *processed_word_tx = 0;
834 if (( !sRxpToTxp_Data.empty() && !sRxtoTx_Meta.empty()
835 && !soTHIS_Shl_Data.full() && !soNrc_meta.full() ))
837 netWordTx = sRxpToTxp_Data.read();
844 soTHIS_Shl_Data.write(netWordTx);
846 meta_in = sRxtoTx_Meta.read().tdata;
848 meta_out_stream.
tlast = 1;
849 meta_out_stream.
tkeep = 0xFF;
866 soNrc_meta.write(meta_out_stream);
868 (*processed_word_tx)++;
869 printf(
"DEBUGGGG: Checking netWordTx.tlast...\n");
870 if(netWordTx.
tlast != 1)
878 printf(
"DEBUG in pTXPath: dequeueFSM=%d - PROCESSING_PACKET, *processed_word_tx=%u\n",
880 if( !sRxpToTxp_Data.empty() && !soTHIS_Shl_Data.full())
883 netWordTx = sRxpToTxp_Data.read();
885 (*processed_word_tx)++;
888 if ((netWordTx.
tlast == 1) || (((*processed_word_tx)*8) %
PACK_SIZE == 0))
891 printf(
"DEBUGGGG: A netWordTx.tlast=1 ... sRxpToTxp_Data.empty()==%u \n", sRxpToTxp_Data.empty());
905 soTHIS_Shl_Data.write(netWordTx);
920 ap_uint<32> *pi_rank,
921 ap_uint<32> *pi_size,
925 stream<NetworkWord> &siSHL_This_Data,
926 stream<NetworkWord> &soTHIS_Shl_Data,
927 stream<NetworkMetaStream> &siNrc_meta,
928 stream<NetworkMetaStream> &soNrc_meta,
929 ap_uint<32> *po_rx_ports
941 stream<DmCmd> &soMemWrCmdP0,
942 stream<DmSts> &siMemWrStsP0,
959 #pragma HLS INTERFACE axis register both port=siSHL_This_Data
960 #pragma HLS INTERFACE axis register both port=soTHIS_Shl_Data
962 #pragma HLS INTERFACE axis register both port=siNrc_meta
963 #pragma HLS INTERFACE axis register both port=soNrc_meta
965 #pragma HLS INTERFACE ap_ovld register port=po_rx_ports name=poROL_NRC_Rx_ports
967 #if HLS_VERSION < 20211
968 #pragma HLS INTERFACE ap_stable register port=pi_rank name=piFMC_ROL_rank
969 #pragma HLS INTERFACE ap_stable register port=pi_size name=piFMC_ROL_size
970 #elif HLS_VERSION >= 20211
971 #pragma HLS stable variable=pi_rank
972 #pragma HLS stable variable=pi_size
// HLS_VERSION is compared numerically by the enclosing #if/#elif, so it is an
// integer and must be printed with %d; passing it to %s is undefined behaviour.
974 printf(
"ERROR: Invalid HLS_VERSION=%d\n", static_cast<int>(HLS_VERSION));
989 #pragma HLS INTERFACE axis register both port=soMemWrCmdP0
990 #pragma HLS INTERFACE axis register both port=siMemWrStsP0
991 #pragma HLS INTERFACE axis register both port=soMemWriteP0
993 #if HLS_VERSION <= 20201
994 #pragma HLS DATA_PACK variable=soMemWrCmdP0 instance=soMemWrCmdP0
995 #pragma HLS DATA_PACK variable=siMemWrStsP0 instance=siMemWrStsP0
996 #elif HLS_VERSION >= 20211
997 #pragma HLS aggregate variable=soMemWrCmdP0 compact=bit
998 #pragma HLS aggregate variable=siMemWrStsP0 compact=bit
// Reached when HLS_VERSION lies between the two supported ranges tested by the
// enclosing #if/#elif. HLS_VERSION is an integer, so print it with %d;
// passing it to %s is undefined behaviour.
1000 printf(
"ERROR: Invalid HLS_VERSION=%d\n", static_cast<int>(HLS_VERSION));
1009 const unsigned int max_axi_rw_burst_length = 16;
1012 #pragma HLS INTERFACE m_axi depth=ddr_mem_depth port=lcl_mem0 bundle=moMEM_Mp1\
1013 max_read_burst_length=max_axi_rw_burst_length max_write_burst_length=max_axi_rw_burst_length offset=direct \
1014 num_read_outstanding=16 num_write_outstanding=16 latency=ddr_latency
1017 #pragma HLS INTERFACE m_axi depth=ddr_mem_depth port=lcl_mem1 bundle=moMEM_Mp1 \
1018 max_read_burst_length=max_axi_rw_burst_length max_write_burst_length=max_axi_rw_burst_length offset=direct \
1019 num_read_outstanding=16 num_write_outstanding=16 latency=ddr_latency
1023 #pragma HLS DATAFLOW
1027 static stream<NetworkWord> sRxpToTxp_Data(
"sRxpToTxP_Data");
1028 static stream<NetworkMetaStream> sRxtoTx_Meta(
"sRxtoTx_Meta");
1029 static unsigned int processed_word_rx;
1030 static unsigned int processed_bytes_rx;
1031 static unsigned int processed_word_tx = 0;
1032 static stream<bool> sImageLoaded(
"sImageLoaded");
1033 static bool skip_read;
1034 static bool write_chunk_to_ddr_pending;
1035 static bool ready_to_accept_new_data;
1036 static bool signal_init;
1041 #ifdef USE_HLSLIB_DATAFLOW
1042 static hlslib::Stream<Data_t_in, MIN_RX_LOOPS> img_in_axi_stream (
"img_in_axi_stream");
1043 static hlslib::Stream<Data_t_out, MIN_TX_LOOPS> img_out_axi_stream (
"img_out_axi_stream");
1045 static stream<ap_uint<INPUT_PTR_WIDTH>> img_in_axi_stream (
"img_in_axi_stream");
1046 static stream<ap_uint<OUTPUT_PTR_WIDTH>> img_out_axi_stream (
"img_out_axi_stream");
1050 static stream<NodeId> sDstNode_sig(
"sDstNode_sig");
1054 #pragma HLS stream variable=sRxtoTx_Meta depth=tot_transfers
1055 #pragma HLS reset variable=enqueueFSM
1056 #pragma HLS reset variable=dequeueFSM
1057 #pragma HLS reset variable=HarrisFSM
1058 #pragma HLS reset variable=processed_word_rx
1059 #pragma HLS reset variable=processed_word_tx
1060 #pragma HLS reset variable=processed_bytes_rx
1062 #pragma HLS stream variable=sImageLoaded depth=1
1063 #pragma HLS reset variable=skip_read
1064 #pragma HLS reset variable=write_chunk_to_ddr_pending
1066 #pragma HLS reset variable=ready_to_accept_new_data
1067 #pragma HLS reset variable=signal_init
1068 #pragma HLS STREAM variable=sDstNode_sig depth=1
1071 #pragma HLS stream variable=img_in_axi_stream depth=img_in_axi_stream_depth
1072 #pragma HLS stream variable=img_out_axi_stream depth=img_out_axi_stream_depth
1074 #pragma HLS reset variable=fsmStateDDR
1080 #ifdef USE_HLSLIB_DATAFLOW
1097 HLSLIB_DATAFLOW_INIT();
1099 HLSLIB_DATAFLOW_FUNCTION(
pRXPath,
1106 &processed_bytes_rx,
1123 HLSLIB_DATAFLOW_FUNCTION(
pTXPath,
1132 HLSLIB_DATAFLOW_FINALIZE();
1156 &processed_bytes_rx,
1169 &processed_bytes_rx,
ap_uint< 32 > patternWriteNum
void pPortAndDestionation(ap_uint< 32 > *pi_rank, ap_uint< 32 > *pi_size, stream< NodeId > &sDstNode_sig, ap_uint< 32 > *po_rx_ports)
#define TRANSFERS_PER_CHUNK_DIVEND
void harris(ap_uint< 32 > *pi_rank, ap_uint< 32 > *pi_size, stream< NetworkWord > &siSHL_This_Data, stream< NetworkWord > &soTHIS_Shl_Data, stream< NetworkMetaStream > &siNrc_meta, stream< NetworkMetaStream > &soNrc_meta, ap_uint< 32 > *po_rx_ports)
Main process of the Harris Application directives.
void storeWordToArray(uint64_t input, ap_uint< 8 > img[16 *16/((64/8))], unsigned int *processed_word, unsigned int *image_loaded)
Store a net word to local memory.
#define HARRIS_RETURN_RESULTS_ABSORB_DDR_LAT
void storeWordToAxiStream(NetworkWord word, stream< ap_uint< 8 >> &img_in_axi_stream, unsigned int *processed_word_rx, unsigned int *processed_bytes_rx, stream< bool > &sImageLoaded)
Store a net word to a local AXI stream.
#define TRANSFERS_PER_CHUNK
unsigned int sRxpToTxp_DataCounter
void pTXPath(stream< NodeId > &sDstNode_sig, stream< NetworkWord > &soTHIS_Shl_Data, stream< NetworkMetaStream > &soNrc_meta, stream< NetworkWord > &sRxpToTxp_Data, stream< NetworkMetaStream > &sRxtoTx_Meta, unsigned int *processed_word_tx, ap_uint< 32 > *pi_rank, ap_uint< 32 > *pi_size)
Transmit Path - From THIS to SHELL.
void pRXPath(stream< NetworkWord > &siSHL_This_Data, stream< NetworkMetaStream > &siNrc_meta, stream< NetworkMetaStream > &sRxtoTx_Meta, stream< ap_uint< 8 >> &img_in_axi_stream, NetworkMetaStream meta_tmp, unsigned int *processed_word_rx, unsigned int *processed_bytes_rx, stream< bool > &sImageLoaded)
Receive Path - From SHELL to THIS.
#define HARRIS_RETURN_RESULTS_FWD
#define HARRIS_RETURN_RESULTS
#define FSM_CHK_PROC_BYTES
void pProcPath(stream< NetworkWord > &sRxpToTxp_Data, stream< ap_uint< 8 >> &img_in_axi_stream, stream< ap_uint< 64 >> &img_out_axi_stream, stream< bool > &sImageLoaded)
Processing Path - Main processing FSM for Vitis kernels.
void cornerHarrisAccelMem(membus_t *img_inp, membus_t *img_out, int rows, int cols, int threshold, int k)
Top-level accelerated function of the Harris Application with array I/F.
void fakeCornerHarrisAccelStream(hls::stream< ap_axiu< 8, 0, 0, 0 > > &img_in_axi_stream, hls::stream< ap_axiu< 64, 0, 0, 0 > > &img_out_axi_stream, unsigned int min_rx_loops, unsigned int min_tx_loops)
void cornerHarrisAccelStream(hls::stream< ap_uint< 8 >> &img_in_axi_stream, hls::stream< ap_uint< 64 >> &img_out_axi_stream, int rows, int cols, int threshold, int k)
Top-level accelerated function of the Harris Application with stream I/F.
#define BITS_PER_10GBITETHRNET_AXI_PACKET
#define BYTES_PER_10GBITETHRNET_AXI_PACKET
#define CYCLES_UNTIL_TIMEOUT
#define FSM_WRITE_NEW_DATA
#define PROCESSING_PACKET
#define WAIT_FOR_STREAM_PAIR