/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file runtime.cc
 * \brief Generic VTA runtime in C++11.
 *
 *  The runtime depends on the specific instruction
 *  stream spec defined in hw_spec.h
 */
#include <vta/driver.h>
#include <vta/hw_spec.h>
#include <vta/runtime.h>
#include <dmlc/logging.h>
#include <tvm/runtime/c_runtime_api.h>

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>
#include <memory>

namespace vta {

// Avoid bad configurations.
static_assert(VTA_UOP_WIDTH == sizeof(VTAUop) * 8,
              "VTA_UOP_WIDTH does not match VTAUop size");

/*! \brief Enable coherent access of data buffers between VTA and CPU */
static const bool kBufferCoherent = VTA_COHERENT_ACCESSES;
/*! \brief Always cache buffers (otherwise, write back to DRAM from CPU) */
static const bool kAlwaysCache = true;

/*!
 * \brief Data buffer represents data on CMA.
 */
struct DataBuffer {
  /*! \return Virtual address of the data. */
  void* virt_addr() const {
    return data_;
  }
  /*! \return Physical address of the data. */
  vta_phy_addr_t phy_addr() const {
    return phy_addr_;
  }
  /*!
   * \brief Invalidate the cache of the given location in the data buffer.
   * \param offset The offset to the data.
   * \param size The size of the data.
   */
  void InvalidateCache(size_t offset, size_t size) {
    if (!kBufferCoherent && kAlwaysCache) {
      VTAInvalidateCache(reinterpret_cast<char *>(data_) + offset,
                         phy_addr_ + offset,
                         size);
    }
  }
  /*!
   * \brief Flush the cache of the given location in the data buffer.
   * \param offset The offset to the data.
   * \param size The size of the data.
   */
  void FlushCache(size_t offset, size_t size) {
    if (!kBufferCoherent && kAlwaysCache) {
      VTAFlushCache(reinterpret_cast<char *>(data_) + offset,
                    phy_addr_ + offset,
                    size);
    }
  }
  /*!
   * \brief Performs a copy operation from host memory to a buffer allocated with VTAMemAlloc.
   * \param dst The destination buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc().
   * \param src The source buffer in host memory.
   * \param size Size of the region in Bytes.
   */
  void MemCopyFromHost(void* dst, const void* src, size_t size) {
    VTAMemCopyFromHost(dst, src, size);
  }
  /*!
   * \brief Performs a copy operation from a buffer allocated with VTAMemAlloc to host memory.
   * \param dst The destination buffer in host memory.
   * \param src The source buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc().
   * \param size Size of the region in Bytes.
   */
  void MemCopyToHost(void* dst, const void* src, size_t size) {
    VTAMemCopyToHost(dst, src, size);
  }
  /*!
   * \brief Allocate a buffer of a given size.
   * \param size The size of the buffer.
   */
  static DataBuffer* Alloc(size_t size) {
    void* data = VTAMemAlloc(size, kAlwaysCache);
    CHECK(data != nullptr);
    DataBuffer* buffer = new DataBuffer();
    buffer->data_ = data;
    buffer->phy_addr_ = VTAMemGetPhyAddr(data);
    return buffer;
  }
  /*!
   * \brief Free the data buffer.
   * \param buffer The buffer to be freed.
   */
  static void Free(DataBuffer* buffer) {
    VTAMemFree(buffer->data_);
    delete buffer;
  }
  /*!
   * \brief Create data buffer header from buffer ptr.
   * \param buffer The buffer pointer.
   * \return The corresponding data buffer header.
   */
  static DataBuffer* FromHandle(const void* buffer) {
    return const_cast<DataBuffer*>(
        reinterpret_cast<const DataBuffer*>(buffer));
  }

 private:
  /*! \brief The internal data. */
  void* data_;
  /*! \brief The physical address of the buffer, excluding header. */
  vta_phy_addr_t phy_addr_;
};
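
/*
 * Illustrative sketch (not part of the runtime): how host code might move
 * data through a DataBuffer on a non-coherent system. The size and the
 * `host_src` pointer are hypothetical placeholders.
 *
 *   vta::DataBuffer* buf = vta::DataBuffer::Alloc(4096);
 *   // Copy host data into the FPGA-accessible region ...
 *   buf->MemCopyFromHost(buf->virt_addr(), host_src, 4096);
 *   // ... then flush so the CPU writes become visible to VTA.
 *   buf->FlushCache(0, 4096);
 *   vta::DataBuffer::Free(buf);
 */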

/*!
 * \brief Micro op kernel.
 *  Contains functions to construct the kernel with prefix Push.
 */
class UopKernel {
 public:
  /*! \brief Loop information. */
  struct LoopEntry {
    uint32_t extent;
    uint32_t dst_factor;
    uint32_t src_factor;
    uint32_t wgt_factor;
  };
  /*!
   * \brief Construct UopKernel with signature.
   * \param signature The pointer to signature.
   * \param nbytes Number of bytes.
   */
  UopKernel(const char* signature, int nbytes)
      : signature_(signature, signature + nbytes) {
  }
  /*!
   * \brief Verify if the signature is correct.
   * \param signature Signature ptr.
   * \param nbytes Number of bytes.
   */
  bool MatchSignature(void* signature, int nbytes) const {
    if (static_cast<size_t>(nbytes) != signature_.size()) return false;
    return memcmp(signature, signature_.data(), nbytes) == 0;
  }
  /*! \return Whether the kernel is cached in SRAM. */
  bool cached() const {
    return sram_begin_ != sram_end_;
  }
  /*! \return The length of the micro op sequence. */
  size_t size() const {
    return seq_.size();
  }
  /*! \return The micro-op data. */
  const VTAUop* data() const {
    return seq_.data();
  }
  /*! \return The loop structure. */
  const std::vector<LoopEntry>& loop() const {
    return loop_;
  }
  /*!
   * \brief Declare loop start.
   * \param extent The loop extent.
   * \param dst_factor Loop factor of accum index.
   * \param src_factor Loop factor of input index.
   * \param wgt_factor Loop factor of weight index.
   */
  void PushLoopBegin(uint32_t extent,
                     uint32_t dst_factor,
                     uint32_t src_factor,
                     uint32_t wgt_factor) {
    LoopEntry le;
    le.extent = extent;
    le.dst_factor = dst_factor;
    le.src_factor = src_factor;
    le.wgt_factor = wgt_factor;
    CHECK_EQ(seq_.size(), 0U);
    CHECK_LT(loop_.size(), 2U);
    loop_.push_back(le);
    ++loop_ptr_;
  }
  /*!
   * \brief Declare loop end.
   */
  void PushLoopEnd() {
    --loop_ptr_;
  }
  /*!
   * \brief Push micro op into kernel.
   * \param mode GEMM mode if set to 0, ALU mode if set to 1.
   * \param reset_out Resets the accum to 0.
   * \param dst_index The accum memory index.
   * \param src_index The input memory (gemm) / accum memory (alu) index.
   * \param wgt_index The weight memory index.
   * \param opcode The ALU opcode.
   * \param use_imm Use immediate in ALU mode if set to true.
   * \param imm_val Immediate value in ALU mode.
   */
  void Push(uint32_t mode,
            uint32_t reset_out,
            uint32_t dst_index,
            uint32_t src_index,
            uint32_t wgt_index,
            uint32_t opcode,
            uint32_t use_imm,
            int32_t imm_val) {
    // Verify dependencies on the accum index
    VerifyDep(dst_index);
    VTAUop op;
    op.dst_idx = dst_index;
    op.src_idx = src_index;
    op.wgt_idx = wgt_index;
    seq_.push_back(op);
    // Ensure that mode is consistent if set
    if (mode_ == 0xFFFFFFFF) {
      mode_ = mode;
    } else {
      CHECK(mode_ == mode);
    }
    // Set reset_out field if unset
    if (reset_out_ == 0xFFFFFFFF) {
      reset_out_ = reset_out;
    } else {
      CHECK(reset_out_ == reset_out);
    }
    // Check kernel opcode and use_imm/imm_val consistency in ALU mode
    if (mode == 1) {
      if (opcode_ == 0xFFFFFFFF) {
        opcode_ = opcode;
        use_imm_ = use_imm;
        imm_val_ = imm_val;
      } else {
        CHECK(opcode_ == opcode);
        CHECK(use_imm_ == use_imm);
        CHECK(imm_val_ == imm_val);
      }
    }
  }
  /*! \brief Dump kernel micro ops to stdout. */
  void Dump() {
    uint32_t size = seq_.size();
    printf("There are %u uops\n", size);
    for (uint32_t i = 0; i < size; ++i) {
      printf("[%04u]\t acc=%u, inp=%u, wgt=%u\n",
             i,
             seq_[i].dst_idx,
             seq_[i].src_idx,
             seq_[i].wgt_idx);
    }
    printf("\n");
  }

 public:
  // The kernel's mode, opcode, immediate setting and value
  uint32_t mode_{0xFFFFFFFF};  // UOP type: 0xFFFFFFFF - unset, 0 - GEMM, 1 - ALU
  uint32_t opcode_{0xFFFFFFFF};
  uint32_t reset_out_{0xFFFFFFFF};
  bool use_imm_{false};
  int16_t imm_val_{0};

 private:
  // Verify that we don't write to the same acc_mem index two cycles in a row
  void VerifyDep(uint32_t dst_index) {
    size_t step = std::min(static_cast<size_t>(2U), seq_.size());
    for (size_t i = seq_.size() - step; i < seq_.size(); ++i) {
      CHECK(seq_[i].dst_idx != dst_index);
    }
  }
  // The uop buffer
  template<int, bool, bool>
  friend class UopQueue;
  friend class CommandQueue;
  // SRAM location if begin != end
  uint32_t sram_begin_{0};
  uint32_t sram_end_{0};
  // The signature used for verification
  std::vector<char> signature_;
  // Internal sequence
  std::vector<VTAUop> seq_;
  // The loop nest structure specific to ALU instructions
  std::vector<LoopEntry> loop_;
  // The loop pointer
  size_t loop_ptr_{0};
};
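
/*
 * Illustrative sketch (not part of the runtime): recording a two-level ALU
 * kernel, the same pattern the VTAUopLoopBegin/VTAUopPush C API follows at
 * the bottom of this file. The signature, extents, factors, and indices are
 * hypothetical. Loops must be declared before the first Push.
 *
 *   vta::UopKernel k(signature, nbytes);
 *   k.PushLoopBegin(8, 1, 1, 0);   // outer loop
 *   k.PushLoopBegin(4, 8, 8, 0);   // inner loop
 *   // mode=1 (ALU), no reset, dst=0, src=1, wgt unused,
 *   // opcode=VTA_ALU_OPCODE_ADD, no immediate
 *   k.Push(1, 0, 0, 1, 0, VTA_ALU_OPCODE_ADD, 0, 0);
 *   k.PushLoopEnd();
 *   k.PushLoopEnd();
 */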

/*!
 * \brief Base class of all queues to send and recv serial data.
 */
template <class T>
class BaseQueue {
 public:
  ~BaseQueue() {
    if (fpga_buff_ != nullptr) {
      VTAMemFree(fpga_buff_);
    }
  }
  /*! \return Content of the DRAM buffer. */
  const T* dram_buffer() const {
    return dram_buffer_.data();
  }
  /*! \return Physical address of DRAM. */
  vta_phy_addr_t dram_phy_addr() const {
    CHECK(fpga_buff_phy_);
    return fpga_buff_phy_;
  }
  /*! \return Whether there is pending information. */
  bool pending() const {
    return sram_begin_ != sram_end_;
  }
  /*! \brief Initialize the space of the buffer. */
  void InitSpace(uint32_t elem_bytes, uint32_t max_bytes, bool coherent, bool always_cache) {
    coherent_ = coherent;
    always_cache_ = always_cache;
    elem_bytes_ = elem_bytes;
    // Allocate buffer ahead of time
    fpga_buff_ = static_cast<char*>(VTAMemAlloc(
        max_bytes, coherent_ || always_cache_));
    CHECK(fpga_buff_ != nullptr);
    fpga_buff_phy_ = VTAMemGetPhyAddr(fpga_buff_);
  }
  /*!
   * \brief Reset the pointer of the buffer.
   *  Set SRAM pointer to be the current end.
   */
  virtual void Reset() {
    dram_buffer_.clear();
    sram_begin_ = sram_end_;
  }

 protected:
  // Cache coherence access (shared memory only)
  bool coherent_{false};
  // Make the buffer cacheable
  bool always_cache_{false};
  // Element bytes
  uint32_t elem_bytes_{0};
  // Begin location of current SRAM read in FIFO mode
  uint32_t sram_begin_{0};
  // End location of current SRAM write in FIFO mode
  uint32_t sram_end_{0};
  // The buffer in DRAM
  std::vector<T> dram_buffer_;
  // FPGA accessible buffer
  void* fpga_buff_{nullptr};
  // Physical address of the FPGA buffer
  vta_phy_addr_t fpga_buff_phy_{0};
};

/*!
 * \brief Micro op buffer that manages the micro op cache.
 */
template<int kMaxBytes, bool kCoherent, bool kAlwaysCache>
class UopQueue : public BaseQueue<VTAUop> {
 public:
  void InitSpace() {
    BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache);
  }
  // Push data to the queue
  template<typename FAutoSync>
  void Push(UopKernel* kernel, FAutoSync fautosync) {
    // If the micro-op kernel is already cached in VTA SRAM, skip it
    if (kernel->cached()) return;
    // Check if we've exceeded the size of the allocated FPGA-readable buffer
    size_t num_op = kernel->size();
    if (dram_buffer_.size() + num_op > kMaxElems) {
      fautosync();
      CHECK(dram_buffer_.size() <= kMaxElems);
    }
    // Cannot have a micro-op kernel larger than the SRAM buffer
    CHECK(num_op <= kMaxNumUop);
    uint32_t uop_begin = 0;
    if (sram_end_ + num_op > kMaxNumUop) {
      // Need to evict
      cache_idx_ = 0;
      sram_begin_ = 0;
      sram_end_ = num_op;
    } else {
      uop_begin = sram_end_;
      sram_end_ += num_op;
    }
    // Simple eviction policy
    uint32_t evict_begin = cache_idx_;
    for (; cache_idx_ < cache_.size(); ++cache_idx_) {
      if (cache_[cache_idx_]->sram_begin_ >= sram_end_) break;
      // Mark the kernel as "invalid"
      cache_[cache_idx_]->sram_begin_ = 0;
      cache_[cache_idx_]->sram_end_ = 0;
    }
    // Increase size of buffer
    kernel->sram_begin_ = uop_begin;
    kernel->sram_end_ = sram_end_;
    CHECK(kernel->cached());
    cache_.insert(cache_.begin() + cache_idx_, kernel);
    cache_.erase(cache_.begin() + evict_begin, cache_.begin() + cache_idx_);
    cache_idx_ = evict_begin + 1;
  }
  // Flush micro op load instruction
  void FlushUopLoad(VTAMemInsn* insn) {
    if (sram_begin_ != sram_end_) {
      // Derive offset in FPGA-readable buffer
      int32_t offset = 0;
      for (uint32_t i = 0; i < cache_idx_ - 1; ++i) {
        offset += cache_[i]->size() * kElemBytes;
      }
      insn->memory_type = VTA_MEM_ID_UOP;
      insn->sram_base = sram_begin_;
      // Update cache idx to physical address map
      insn->dram_base = (fpga_buff_phy_ + offset) / kElemBytes;
      insn->y_size = 1;
      insn->x_size = (sram_end_ - sram_begin_);
      insn->x_stride = (sram_end_ - sram_begin_);
      insn->y_pad_0 = 0;
      insn->y_pad_1 = 0;
      insn->x_pad_0 = 0;
      insn->x_pad_1 = 0;
      // Reset indices
      sram_begin_ = sram_end_;
    }
  }
  /*! \brief Clear the cache and reset the base queue buffer. */
  void Reset() {
    cache_.clear();
    cache_idx_ = 0;
    BaseQueue<VTAUop>::Reset();
  }
  void AutoReadBarrier() {
    ReadBarrier();
  }
  /*! \brief Write barrier to make sure that data written by the CPU is visible to VTA. */
  void ReadBarrier() {
    CHECK(fpga_buff_ != nullptr);
    CHECK(fpga_buff_phy_);
    // Iterate over caches; compute required size in FPGA-readable memory
    uint32_t buff_size = 0;
    for (uint32_t i = 0; i < cache_.size(); ++i) {
      buff_size += cache_[i]->size() * kElemBytes;
    }
    CHECK(buff_size <= kMaxBytes);
    // Move kernel contents to the FPGA-readable buffer
    uint32_t offset = 0;
    for (uint32_t i = 0; i < cache_.size(); ++i) {
      uint32_t ksize = cache_[i]->size() * kElemBytes;
      VTAMemCopyFromHost(static_cast<char*>(fpga_buff_) + offset,
                         cache_[i]->data(),
                         ksize);
      // Update offset
      offset += ksize;
    }
    // Flush if we're using a shared memory system
    // and if the interface is non-coherent
    if (!coherent_ && always_cache_) {
      VTAFlushCache(fpga_buff_,
                    fpga_buff_phy_,
                    offset);
    }
  }

 private:
  // Cache pointer
  uint32_t cache_idx_{0};
  // Cached ring, sorted by sram_begin
  std::vector<UopKernel*> cache_;
  // Constants
  static constexpr int kElemBytes = sizeof(VTAUop);
  static constexpr int kMaxNumUop = VTA_UOP_BUFF_DEPTH;
  static constexpr int kMaxElems = kMaxBytes / kElemBytes;
};

// Internal kernel structure
class UopKernelMap {
 public:
  // Simple hash map
  UopKernel** Get(void* signature,
                  int nbytes) {
    uint32_t key = 0;
    CHECK(nbytes == 0 || nbytes == sizeof(int));
    if (nbytes == sizeof(int)) {
      memcpy(&key, signature, sizeof(int));
      key = key + 1;
    }
    CHECK_LT(key, 100);
    if (kmap_.size() <= key) {
      kmap_.resize(key + 1, nullptr);
    }
    return &(kmap_[key]);
  }

 private:
  std::vector<UopKernel*> kmap_;
};

enum PipelineStage : int {
  kNoneStage = 0,
  kLoadStage = 1,
  kComputeStage = 2,
  kStoreStage = 3
};

// Instruction Queue
template<int kMaxBytes, bool kCoherent, bool kAlwaysCache>
class InsnQueue : public BaseQueue<VTAGenericInsn> {
 public:
  /*! \brief Initialize the space. */
  void InitSpace() {
    BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache);
    // Initialize the stage
    std::fill(pending_pop_prev_, pending_pop_prev_ + 4, 0);
    std::fill(pending_pop_next_, pending_pop_next_ + 4, 0);
  }
  /*! \return The data pointer. */
  VTAGenericInsn* data() {
    return dram_buffer_.data();
  }
  /*! \return Number of instructions. */
  uint32_t count() {
    return dram_buffer_.size();
  }
  // Insert a dependency pop
  void DepPop(int from, int to) {
    // NOTE: This instruction executes on queue[to]
    if (from < to) {
      if (pending_pop_prev_[to]) {
        this->CommitPendingPop(to);
      }
      pending_pop_prev_[to] = 1;
    } else {
      if (pending_pop_next_[to]) {
        this->CommitPendingPop(to);
      }
      pending_pop_next_[to] = 1;
    }
    // Reject impossible stage combinations
    CHECK(from != kLoadStage || to != kStoreStage);
    CHECK(from != kStoreStage || to != kLoadStage);
  }
  // Insert a dependency push
  void DepPush(int from, int to) {
    // NOTE: this instruction executes on queue[from]
    this->CommitPendingPop(from);
    if (!dram_buffer_.empty()) {
      VTAMemInsn* mptr = reinterpret_cast<VTAMemInsn*>(&dram_buffer_.back());
      if (GetPipelineStage(mptr) == from) {
        if (from < to && !mptr->push_next_dep) {
          // push(LD->C) or push(C->ST)
          mptr->push_next_dep = true; return;
        } else if (from > to && !mptr->push_prev_dep) {
          // push(C->LD) or push(ST->C)
          mptr->push_prev_dep = true; return;
        }
      }
    }
    if (from < to) {
      // Push next dep
      PushNoop(from, false, true, false, false);
    } else {
      // Push prev dep
      PushNoop(from, true, false, false, false);
    }
  }
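  /*
   * Illustrative sketch (not part of the runtime): expressing "compute may
   * only run after the load finishes" with a push/pop token pair on the
   * load->compute dependence queue, mirroring what Synchronize() does below.
   *
   *   insn_queue.DepPush(kLoadStage, kComputeStage);  // load signals compute
   *   insn_queue.DepPop(kLoadStage, kComputeStage);   // compute waits on load
   */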
  // Create a new instruction for a GEMM stage
  VTAGemInsn* CreateGemInsn() {
    return reinterpret_cast<VTAGemInsn*>(
        Create(kComputeStage));
  }
  // Create a new instruction for an ALU stage
  VTAAluInsn* CreateAluInsn() {
    return reinterpret_cast<VTAAluInsn*>(
        Create(kComputeStage));
  }
  // Create a new instruction for a memory stage
  VTAMemInsn* CreateMemInsn(int memory_type) {
    return reinterpret_cast<VTAMemInsn*>(
        Create(GetMemPipelineStage(memory_type)));
  }
  // Create a new instruction for a store stage
  VTAMemInsn* CreateStoreInsn() {
    return reinterpret_cast<VTAMemInsn*>(
        Create(kStoreStage));
  }
  // Rewrite instruction stream to force serial execution
  void RewriteForceSerial() {
    int insn_count = count();
    VTAMemInsn* mem_ptr = reinterpret_cast<VTAMemInsn*>(data());
    VTAMemInsn* mem_last_store_ptr = nullptr;
    VTAMemInsn* mem_last_ptr = nullptr;
    for (int i = 1; i < insn_count; ++i) {
      PipelineStage prev = GetPipelineStageAll(mem_ptr + i - 1);
      PipelineStage now = GetPipelineStageAll(mem_ptr + i);
      if (prev == kLoadStage && now == kComputeStage) {
        mem_ptr[i - 1].push_prev_dep = false;
        mem_ptr[i - 1].push_next_dep = true;
        mem_ptr[i].pop_prev_dep = true;
        mem_ptr[i].pop_next_dep = false;
      } else if (prev == kComputeStage && now == kLoadStage) {
        mem_ptr[i - 1].push_prev_dep = true;
        mem_ptr[i - 1].push_next_dep = false;
        mem_ptr[i].pop_prev_dep = false;
        mem_ptr[i].pop_next_dep = true;
      } else if (prev == kStoreStage && now == kComputeStage) {
        mem_ptr[i - 1].push_prev_dep = true;
        mem_ptr[i - 1].push_next_dep = false;
        mem_ptr[i].pop_prev_dep = false;
        mem_ptr[i].pop_next_dep = true;
      } else if (prev == kComputeStage && now == kStoreStage) {
        mem_ptr[i - 1].push_prev_dep = false;
        mem_ptr[i - 1].push_next_dep = true;
        mem_ptr[i].pop_prev_dep = true;
        mem_ptr[i].pop_next_dep = false;
      } else {
        mem_ptr[i - 1].push_prev_dep = false;
        mem_ptr[i - 1].push_next_dep = false;
        mem_ptr[i].pop_prev_dep = false;
        mem_ptr[i].pop_next_dep = false;
      }
      if (now == kStoreStage) {
        mem_last_store_ptr = &mem_ptr[i];
      }
      mem_last_ptr = &mem_ptr[i];
    }
    // Set dependencies to make sure all core instructions get executed
    // before the last FINISH instruction
    if (mem_last_store_ptr && mem_last_ptr == mem_last_store_ptr) {
      mem_last_store_ptr->push_prev_dep = true;
      if (!pending_pop_next_[kComputeStage]) {
        DepPop(kStoreStage, kComputeStage);
      }
      CommitPendingPop(kComputeStage);
    } else {
      pending_pop_next_[kComputeStage] = 0;
    }
    DepPush(kComputeStage, kLoadStage);
    DepPop(kLoadStage, kComputeStage);
    if (!pending_pop_next_[kLoadStage]) {
      DepPop(kComputeStage, kLoadStage);
    }
    CommitPendingPop(kLoadStage);
    DepPush(kLoadStage, kComputeStage);
    CommitPendingPop(kComputeStage);
  }
  // Helper function: Get opcode string
  const char* getOpcodeString(int opcode, bool use_imm) {
      // The string name
      if (opcode == VTA_ALU_OPCODE_MIN) {
          if (use_imm) {
              return "min imm";
          } else {
              return "min";
          }
      } else if (opcode == VTA_ALU_OPCODE_MAX) {
          if (use_imm) {
              return "max imm";
          } else {
              return "max";
          }
      } else if (opcode == VTA_ALU_OPCODE_ADD) {
          if (use_imm) {
              return "add imm";
          } else {
              return "add";
          }
      } else if (opcode == VTA_ALU_OPCODE_SHR) {
          return "shr";
      }

      return "unknown op";
  }
  // Dump instructions in the queue
  void DumpInsn() {
    // Keep tabs on dependence queues
    int l2g_queue = 0;
    int g2l_queue = 0;
    int s2g_queue = 0;
    int g2s_queue = 0;
    // Converter
    union VTAInsn c;
    // Iterate over all instructions
    int insn_count = count();
    const VTAGenericInsn* insn = data();
    printf("There are %d instructions\n", insn_count);
    for (int i = 0; i < insn_count; ++i) {
      // Fetch instruction and decode opcode
      c.generic = insn[i];
      printf("INSTRUCTION %d: ", i);
      if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
        if (c.mem.x_size == 0) {
          if (c.mem.opcode == VTA_OPCODE_STORE) {
            printf("NOP-STORE-STAGE\n");
          } else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) {
            printf("NOP-COMPUTE-STAGE\n");
          } else {
            printf("NOP-MEMORY-STAGE\n");
          }
          printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
                 static_cast<int>(c.mem.pop_prev_dep),
                 static_cast<int>(c.mem.pop_next_dep),
                 static_cast<int>(c.mem.push_prev_dep),
                 static_cast<int>(c.mem.push_next_dep));
          // Count status in queues
          if (c.mem.opcode == VTA_OPCODE_STORE) {
            CHECK(c.mem.pop_next_dep == false);
            CHECK(c.mem.push_next_dep == false);
            if (c.mem.pop_prev_dep) g2s_queue--;
            if (c.mem.push_prev_dep) s2g_queue++;
          } else if (c.mem.opcode == VTA_OPCODE_LOAD &&
                     (c.mem.memory_type == VTA_MEM_ID_INP ||
                      c.mem.memory_type == VTA_MEM_ID_WGT) ) {
            CHECK(c.mem.pop_prev_dep == false);
            CHECK(c.mem.push_prev_dep == false);
            if (c.mem.pop_next_dep) g2l_queue--;
            if (c.mem.push_next_dep) l2g_queue++;
          } else {
            if (c.mem.pop_prev_dep) l2g_queue--;
            if (c.mem.push_prev_dep) g2l_queue++;
            if (c.mem.pop_next_dep) s2g_queue--;
            if (c.mem.push_next_dep) g2s_queue++;
          }
          printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
          printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
          continue;
        }
        // Print instruction field information
        if (c.mem.opcode == VTA_OPCODE_LOAD) {
          printf("LOAD ");
          if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n");
          if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n");
          if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n");
          if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n");
        }
        if (c.mem.opcode == VTA_OPCODE_STORE) {
          printf("STORE:\n");
        }
        printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
               static_cast<int>(c.mem.pop_prev_dep),
               static_cast<int>(c.mem.pop_next_dep),
               static_cast<int>(c.mem.push_prev_dep),
               static_cast<int>(c.mem.push_next_dep));
        printf("\tDRAM: 0x%08x, SRAM:0x%04x\n",
               static_cast<int>(c.mem.dram_base),
               static_cast<int>(c.mem.sram_base));
        printf("\ty: size=%d, pad=[%d, %d]\n",
               static_cast<int>(c.mem.y_size),
               static_cast<int>(c.mem.y_pad_0),
               static_cast<int>(c.mem.y_pad_1));
        printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n",
               static_cast<int>(c.mem.x_size),
               static_cast<int>(c.mem.x_stride),
               static_cast<int>(c.mem.x_pad_0),
               static_cast<int>(c.mem.x_pad_1));
      } else if (c.mem.opcode == VTA_OPCODE_GEMM) {
        // Print instruction field information
        printf("GEMM\n");

        printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
               static_cast<int>(c.mem.pop_prev_dep),
               static_cast<int>(c.mem.pop_next_dep),
               static_cast<int>(c.mem.push_prev_dep),
               static_cast<int>(c.mem.push_next_dep));
        printf("\treset_out: %d\n", static_cast<int>(c.gemm.reset_reg));
        printf("\trange (%d, %d)\n",
               static_cast<int>(c.gemm.uop_bgn),
               static_cast<int>(c.gemm.uop_end));
        printf("\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n",
               static_cast<int>(c.gemm.iter_out),
               static_cast<int>(c.gemm.wgt_factor_out),
               static_cast<int>(c.gemm.src_factor_out),
               static_cast<int>(c.gemm.dst_factor_out));
        printf("\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n",
               static_cast<int>(c.gemm.iter_in),
               static_cast<int>(c.gemm.wgt_factor_in),
               static_cast<int>(c.gemm.src_factor_in),
               static_cast<int>(c.gemm.dst_factor_in));
      } else if (c.mem.opcode == VTA_OPCODE_ALU) {
        // Print instruction field information
        printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm));
        printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
               static_cast<int>(c.mem.pop_prev_dep),
               static_cast<int>(c.mem.pop_next_dep),
               static_cast<int>(c.mem.push_prev_dep),
               static_cast<int>(c.mem.push_next_dep));
        printf("\treset_out: %d\n", static_cast<int>(c.alu.reset_reg));
        printf("\trange (%d, %d)\n",
               static_cast<int>(c.alu.uop_bgn),
               static_cast<int>(c.alu.uop_end));
        printf("\touter loop - iter: %d, dst: %d, src: %d\n",
               static_cast<int>(c.alu.iter_out),
               static_cast<int>(c.alu.dst_factor_out),
               static_cast<int>(c.alu.src_factor_out));
        printf("\tinner loop - iter: %d, dst: %d, src: %d\n",
               static_cast<int>(c.alu.iter_in),
               static_cast<int>(c.alu.dst_factor_in),
               static_cast<int>(c.alu.src_factor_in));
      } else if (c.mem.opcode == VTA_OPCODE_FINISH) {
        printf("FINISH\n");
      }

      // Count status in queues
      if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
        if (c.mem.opcode == VTA_OPCODE_STORE) {
            CHECK(c.mem.pop_next_dep == false);
            CHECK(c.mem.push_next_dep == false);
            if (c.mem.pop_prev_dep) g2s_queue--;
            if (c.mem.push_prev_dep) s2g_queue++;
        } else if (c.mem.opcode == VTA_OPCODE_LOAD &&
                   (c.mem.memory_type == VTA_MEM_ID_INP ||
                    c.mem.memory_type == VTA_MEM_ID_WGT) ) {
            CHECK(c.mem.pop_prev_dep == false);
            CHECK(c.mem.push_prev_dep == false);
            if (c.mem.pop_next_dep) g2l_queue--;
            if (c.mem.push_next_dep) l2g_queue++;
        } else {
            if (c.mem.pop_prev_dep) l2g_queue--;
            if (c.mem.push_prev_dep) g2l_queue++;
            if (c.mem.pop_next_dep) s2g_queue--;
            if (c.mem.push_next_dep) g2s_queue++;
        }
      } else if (c.mem.opcode == VTA_OPCODE_GEMM ||
                 c.mem.opcode == VTA_OPCODE_ALU) {
        // Count status in queues
        if (c.gemm.pop_prev_dep) l2g_queue--;
        if (c.gemm.push_prev_dep) g2l_queue++;
        if (c.gemm.pop_next_dep) s2g_queue--;
        if (c.gemm.push_next_dep) g2s_queue++;
      }
      printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
      printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
    }
  }
  // Commit all pending pops for the corresponding stage
  void CommitPendingPop(int stage) {
    // Handle the LD<->compute queue
    // NOTE: pop executes on target(stage)
    CHECK(stage > 0 && stage < 4);
    if (pending_pop_prev_[stage] ||
        pending_pop_next_[stage]) {
      PushNoop(stage, false, false,
               pending_pop_prev_[stage],
               pending_pop_next_[stage]);
      pending_pop_prev_[stage] = 0;
      pending_pop_next_[stage] = 0;
    }
  }
  void CommitPending() {
    for (int i = kLoadStage; i <= kStoreStage; ++i) {
      CommitPendingPop(i);
    }
  }
  bool PendingPop() {
    for (int i = kLoadStage; i <= kStoreStage; ++i) {
      if (pending_pop_prev_[i]) return true;
      if (pending_pop_next_[i]) return true;
    }
    return false;
  }
  void AutoReadBarrier() {
    ReadBarrier();
  }
  /*! \brief Write barrier to make sure that data written by the CPU is visible to VTA. */
  void ReadBarrier() {
    CHECK(fpga_buff_ != nullptr);
    CHECK(fpga_buff_phy_);
    uint32_t buff_size = dram_buffer_.size() * elem_bytes_;
    CHECK(buff_size <= kMaxBytes);
    // Copy contents of the DRAM buffer to the FPGA buffer
    VTAMemCopyFromHost(fpga_buff_,
                       dram_buffer_.data(),
                       buff_size);
    // Flush if we're using a shared memory system
    // and if the interface is non-coherent
    if (!coherent_ && always_cache_) {
      VTAFlushCache(fpga_buff_,
                    fpga_buff_phy_,
                    buff_size);
    }
  }

 protected:
  /*! \return Pointer to a new instruction appended to the buffer. */
  VTAGenericInsn* NextInsn() {
    VTAGenericInsn insn;
    dram_buffer_.push_back(insn);
    return &dram_buffer_.back();
  }
  // Create a new instruction for a given stage
  VTAGenericInsn* Create(PipelineStage stage) {
    VTAGenericInsn* gptr = NextInsn();
    VTAMemInsn* mptr = reinterpret_cast<VTAMemInsn*>(gptr);
    mptr->pop_prev_dep = pending_pop_prev_[stage];
    mptr->pop_next_dep = pending_pop_next_[stage];
    mptr->push_prev_dep = false;
    mptr->push_next_dep = false;
    pending_pop_prev_[stage] = 0;
    pending_pop_next_[stage] = 0;
    return gptr;
  }
  // Get stage of the memory
  static PipelineStage GetMemPipelineStage(int memory_type) {
    if (memory_type == VTA_MEM_ID_ACC) return kComputeStage;
    if (memory_type == VTA_MEM_ID_UOP) return kComputeStage;
    return kLoadStage;
  }
  // Get stage of the computation
  static PipelineStage GetPipelineStage(VTAMemInsn* insn) {
    if (insn->opcode == VTA_OPCODE_GEMM) return kComputeStage;
    if (insn->opcode == VTA_OPCODE_ALU) return kComputeStage;
    if (insn->opcode == VTA_OPCODE_LOAD) {
      if (insn->x_size == 0) return kNoneStage;
      if (insn->memory_type == VTA_MEM_ID_ACC) return kComputeStage;
      if (insn->memory_type == VTA_MEM_ID_UOP) return kComputeStage;
      return kLoadStage;
    }
    if (insn->opcode == VTA_OPCODE_STORE) {
      // FIXME: Right now memory_type is a 2-bit field, which means that
      //        VTA_MEM_ID_OUT will appear as 0. For now we'll refrain from
      //        checking the memory_type to avoid a CHECK error...
      return kStoreStage;
    }
    LOG(FATAL) << "not reached";
    return kNoneStage;
  }

  // Get stage of memory and computation
  static PipelineStage GetPipelineStageAll(VTAMemInsn* insn) {
      PipelineStage stage = GetPipelineStage(insn);
      if (stage != kNoneStage) return stage;
      return GetMemPipelineStage(insn->memory_type);
  }

  // Push no-op
  void PushNoop(int stage,
                bool push_prev_dep, bool push_next_dep,
                bool pop_prev_dep, bool pop_next_dep) {
    VTAMemInsn* insn = reinterpret_cast<VTAMemInsn*>(NextInsn());
    insn->opcode = (stage == kStoreStage ? VTA_OPCODE_STORE : VTA_OPCODE_LOAD);
    insn->push_prev_dep = push_prev_dep;
    insn->push_next_dep = push_next_dep;
    insn->pop_prev_dep = pop_prev_dep;
    insn->pop_next_dep = pop_next_dep;
    insn->sram_base = 0;
    insn->dram_base = 0;
    insn->y_size = 0;
    insn->x_size = 0;
    insn->x_stride = 0;
    insn->y_pad_0 = 0;
    insn->y_pad_1 = 0;
    insn->x_pad_0 = 0;
    insn->x_pad_1 = 0;
    insn->memory_type = (stage == kLoadStage ? VTA_MEM_ID_INP : VTA_MEM_ID_UOP);
  }

 private:
  // Pending pops of each instruction queue; qid=0 is not used
  int pending_pop_prev_[4];
  int pending_pop_next_[4];
  static constexpr int kElemBytes = sizeof(VTAGenericInsn);
  static constexpr int kMaxElems = kMaxBytes / kElemBytes;
};

/*!
 * \brief The command queue object that handles the request.
 */
class CommandQueue {
 public:
  CommandQueue() {
    this->InitSpace();
  }
  void InitSpace() {
    uop_queue_.InitSpace();
    insn_queue_.InitSpace();
    device_ = VTADeviceAlloc();
    CHECK(device_ != nullptr);
  }

  ~CommandQueue() {
    VTADeviceFree(device_);
  }

  uint32_t GetElemBytes(uint32_t memory_id) {
    uint32_t elem_bytes = 0;
    switch (memory_id) {
      case VTA_MEM_ID_UOP:
          elem_bytes = VTA_UOP_ELEM_BYTES;
          break;
      case VTA_MEM_ID_INP:
          elem_bytes = VTA_INP_ELEM_BYTES;
          break;
      case VTA_MEM_ID_WGT:
          elem_bytes = VTA_WGT_ELEM_BYTES;
          break;
      case VTA_MEM_ID_ACC:
          elem_bytes = VTA_ACC_ELEM_BYTES;
          break;
      case VTA_MEM_ID_OUT:
          elem_bytes = VTA_OUT_ELEM_BYTES;
          break;
      default:
          LOG(FATAL) << "Memory id not recognized: " << memory_id;
          break;
    }
    // The element size should not be larger than VTA_PAGE_BYTES.
    CHECK_GE(VTA_PAGE_BYTES, elem_bytes);
    return elem_bytes;
  }
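  /*
   * Illustrative note: LoadBuffer2D/StoreBuffer2D below express dram_base in
   * element units, i.e. phy_addr / elem_bytes + elem_offset. For example,
   * with a (hypothetical) element size of 16 bytes, a buffer at physical
   * address 0x10000 with src_elem_offset 4 yields dram_base 0x1000 + 4.
   */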

  void LoadBuffer2D(void* src_dram_addr,
                    uint32_t src_elem_offset,
                    uint32_t x_size,
                    uint32_t y_size,
                    uint32_t x_stride,
                    uint32_t x_pad_before,
                    uint32_t y_pad_before,
                    uint32_t x_pad_after,
                    uint32_t y_pad_after,
                    uint32_t dst_sram_index,
                    uint32_t dst_memory_type) {
    VTAMemInsn* insn = insn_queue_.CreateMemInsn(dst_memory_type);
    insn->opcode = VTA_OPCODE_LOAD;
    insn->memory_type = dst_memory_type;
    insn->sram_base = dst_sram_index;
    DataBuffer* src = DataBuffer::FromHandle(src_dram_addr);
    insn->dram_base = src->phy_addr() / GetElemBytes(dst_memory_type) + src_elem_offset;
    insn->y_size = y_size;
    insn->x_size = x_size;
    insn->x_stride = x_stride;
    insn->y_pad_0 = y_pad_before;
    insn->y_pad_1 = y_pad_after;
    insn->x_pad_0 = x_pad_before;
    insn->x_pad_1 = x_pad_after;
    this->CheckInsnOverFlow();
  }

  void StoreBuffer2D(uint32_t src_sram_index,
                     uint32_t src_memory_type,
                     void* dst_dram_addr,
                     uint32_t dst_elem_offset,
                     uint32_t x_size,
                     uint32_t y_size,
                     uint32_t x_stride) {
    VTAMemInsn* insn = insn_queue_.CreateStoreInsn();
    insn->opcode = VTA_OPCODE_STORE;
    insn->memory_type = src_memory_type;
    insn->sram_base = src_sram_index;
    DataBuffer* dst = DataBuffer::FromHandle(dst_dram_addr);
    insn->dram_base = dst->phy_addr() / GetElemBytes(src_memory_type) + dst_elem_offset;
    insn->y_size = y_size;
    insn->x_size = x_size;
    insn->x_stride = x_stride;
    insn->y_pad_0 = 0;
    insn->y_pad_1 = 0;
    insn->x_pad_0 = 0;
    insn->x_pad_1 = 0;
    this->CheckInsnOverFlow();
  }

  void DepPush(int from_qid, int to_qid) {
    insn_queue_.DepPush(from_qid, to_qid);
  }

  void DepPop(int from_qid, int to_qid) {
    insn_queue_.DepPop(from_qid, to_qid);
  }

  void ReadBarrier(void* buffer, uint32_t elem_bits, uint32_t start, uint32_t extent) {
    if (!(debug_flag_ & VTA_DEBUG_SKIP_READ_BARRIER)) {
      uint32_t elem_bytes = (elem_bits + 8 - 1) / 8;
      DataBuffer::FromHandle(buffer)->FlushCache(
          elem_bytes * start, elem_bytes * extent);
    }
  }

  void WriteBarrier(void* buffer, uint32_t elem_bits, uint32_t start, uint32_t extent) {
    if (!(debug_flag_ & VTA_DEBUG_SKIP_WRITE_BARRIER)) {
      uint32_t elem_bytes = (elem_bits + 8 - 1) / 8;
      DataBuffer::FromHandle(buffer)->InvalidateCache(
          elem_bytes * start, elem_bytes * extent);
    }
  }

  void Synchronize(uint32_t wait_cycles) {
    // Insert dependences to force serialization
    if (debug_flag_ & VTA_DEBUG_FORCE_SERIAL) {
      insn_queue_.RewriteForceSerial();
    } else {
      // This will issue finish after the last store finishes
      insn_queue_.DepPush(kStoreStage, kComputeStage);
      insn_queue_.DepPush(kLoadStage, kComputeStage);
      insn_queue_.DepPop(kStoreStage, kComputeStage);
      insn_queue_.DepPop(kLoadStage, kComputeStage);
      insn_queue_.CommitPendingPop(kComputeStage);
    }
    // NOTE: FINISH cannot contain pop
    VTAGemInsn* insn = insn_queue_.CreateGemInsn();
    insn->opcode = VTA_OPCODE_FINISH;
    CHECK(!insn_queue_.PendingPop());
    // Check if there are no instructions to execute at all
    if (insn_queue_.count() == 0) return;
    // Synchronization for the queues
    uop_queue_.AutoReadBarrier();
    insn_queue_.AutoReadBarrier();
    // Dump instructions if debug enabled
    if (debug_flag_ & VTA_DEBUG_DUMP_INSN) {
      insn_queue_.DumpInsn();
    }
    // Make sure that the last instruction is a finish instruction
    CHECK(reinterpret_cast<VTAMemInsn*>(
        insn_queue_.data())[insn_queue_.count()-1].opcode == VTA_OPCODE_FINISH);

    // Make sure that we don't exceed contiguous physical memory limits
    CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) < VTA_MAX_XFER);
    int timeout = VTADeviceRun(
        device_,
        insn_queue_.dram_phy_addr(),
        insn_queue_.count(),
        wait_cycles);
    CHECK_EQ(timeout, 0);
    // Reset buffers
    uop_queue_.Reset();
    insn_queue_.Reset();
  }

  // Get record kernel
  UopKernel* record_kernel() const {
    CHECK(record_kernel_ != nullptr);
    return record_kernel_;
  }

  // Set debug flag
  void SetDebugFlag(int debug_flag) {
    debug_flag_ = debug_flag;
  }

  void PushGEMMOp(void** uop_handle,
                  int (*finit)(void*),
                  void* signature,
                  int nbytes) {
    UopKernelMap** uptr = reinterpret_cast<UopKernelMap**>(uop_handle);
    if (uptr[0] == nullptr) {
      uptr[0] = new UopKernelMap();
    }
    UopKernel** kptr = uptr[0]->Get(signature, nbytes);
    if (kptr[0] == nullptr) {
      record_kernel_ = new UopKernel(static_cast<char*>(signature), nbytes);
      CHECK_EQ((*finit)(signature), 0);
      kptr[0] = static_cast<UopKernel*>(record_kernel_);
      if (debug_flag_ & VTA_DEBUG_DUMP_UOP) {
        record_kernel_->Dump();
      }
      record_kernel_ = nullptr;
    }
    this->PushGEMMOp(static_cast<UopKernel*>(kptr[0]));
    this->CheckInsnOverFlow();
  }

  void PushALUUop(void** uop_handle,
                  int (*finit)(void*),
                  void* signature,
                  int nbytes) {
    UopKernelMap** uptr = reinterpret_cast<UopKernelMap**>(uop_handle);
    if (uptr[0] == nullptr) {
      uptr[0] = new UopKernelMap();
    }
    UopKernel** kptr = uptr[0]->Get(signature, nbytes);
    if (kptr[0] == nullptr) {
      record_kernel_ = new UopKernel(static_cast<char*>(signature), nbytes);
      CHECK_EQ((*finit)(signature), 0);
      kptr[0] = static_cast<UopKernel*>(record_kernel_);
      if (debug_flag_ & VTA_DEBUG_DUMP_UOP) {
        record_kernel_->Dump();
      }
      record_kernel_ = nullptr;
    }
    this->PushALUUop(static_cast<UopKernel*>(kptr[0]));
    this->CheckInsnOverFlow();
  }

  static std::shared_ptr<CommandQueue>& ThreadLocal() {
    static std::shared_ptr<CommandQueue> inst =
        std::make_shared<CommandQueue>();
    if (inst == nullptr) {
      inst = std::make_shared<CommandQueue>();
    }
    return inst;
  }

  static void Shutdown() {
    ThreadLocal().reset();
  }

 private:
  // Push GEMM uop to the command buffer
  void PushGEMMOp(UopKernel* kernel) {
    uop_queue_.Push(kernel,
                    [this]() { this->AutoSync(); });
    if (uop_queue_.pending()) {
      VTAMemInsn* insn = insn_queue_.CreateMemInsn(VTA_MEM_ID_UOP);
      insn->opcode = VTA_OPCODE_LOAD;
      uop_queue_.FlushUopLoad(insn);
    }
    VTAGemInsn* insn = insn_queue_.CreateGemInsn();
    insn->opcode = VTA_OPCODE_GEMM;
    insn->reset_reg = kernel->reset_out_;
    insn->uop_bgn = kernel->sram_begin_;
    insn->uop_end = kernel->sram_end_;
    const std::vector<UopKernel::LoopEntry> &loop = kernel->loop();
    if (loop.size() > 0) {
      insn->iter_out = loop[0].extent;
      insn->wgt_factor_out = loop[0].wgt_factor;
      insn->src_factor_out = loop[0].src_factor;
      insn->dst_factor_out = loop[0].dst_factor;
    } else {
      insn->iter_out = 1;
      insn->wgt_factor_out = 0;
      insn->src_factor_out = 0;
      insn->dst_factor_out = 0;
    }
    if (loop.size() > 1) {
      insn->iter_in = loop[1].extent;
      insn->wgt_factor_in = loop[1].wgt_factor;
      insn->src_factor_in = loop[1].src_factor;
      insn->dst_factor_in = loop[1].dst_factor;
    } else {
      insn->iter_in = 1;
      insn->wgt_factor_in = 0;
      insn->src_factor_in = 0;
      insn->dst_factor_in = 0;
    }
  }

  // Push ALU uop to the command buffer
  void PushALUUop(UopKernel* kernel) {
    uop_queue_.Push(kernel,
                    [this]() { this->AutoSync(); });
    if (uop_queue_.pending()) {
      VTAMemInsn* insn = insn_queue_.CreateMemInsn(VTA_MEM_ID_UOP);
      insn->opcode = VTA_OPCODE_LOAD;
      uop_queue_.FlushUopLoad(insn);
    }
    VTAAluInsn* insn = insn_queue_.CreateAluInsn();
    insn->opcode = VTA_OPCODE_ALU;
    insn->reset_reg = kernel->reset_out_;
    insn->uop_bgn = kernel->sram_begin_;
    insn->uop_end = kernel->sram_end_;
    insn->alu_opcode = kernel->opcode_;
    insn->use_imm = kernel->use_imm_;
    insn->imm = kernel->imm_val_;
    const std::vector<UopKernel::LoopEntry> &loop = kernel->loop();
    if (loop.size() == 0) {
      insn->iter_out = 1;
      insn->dst_factor_out = 0;
      insn->src_factor_out = 0;
      insn->iter_in = 1;
      insn->dst_factor_in = 0;
      insn->src_factor_in = 0;
    } else if (loop.size() == 1) {
      insn->iter_out = 1;
      insn->dst_factor_out = 0;
      insn->src_factor_out = 0;
      insn->iter_in = loop[0].extent;
      insn->dst_factor_in = loop[0].dst_factor;
      insn->src_factor_in = loop[0].src_factor;
    } else {
      insn->iter_out = loop[0].extent;
      insn->dst_factor_out = loop[0].dst_factor;
      insn->src_factor_out = loop[0].src_factor;
      insn->iter_in = loop[1].extent;
      insn->dst_factor_in = loop[1].dst_factor;
      insn->src_factor_in = loop[1].src_factor;
    }
  }

  void CheckInsnOverFlow() {
    // At each API call, we can at most commit:
    // one pending store, one pending load, and one uop
    if ((insn_queue_.count() + 4) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) {
      this->AutoSync();
    }
  }
  // Auto sync when instructions overflow
  void AutoSync() {
    this->Synchronize(1u << 31);
  }

  // Internal debug flag
  int debug_flag_{0};
  // The kernel we are currently recording
  UopKernel* record_kernel_{nullptr};
  // Micro op queue
  UopQueue<VTA_MAX_XFER, kBufferCoherent, kAlwaysCache> uop_queue_;
  // Instruction queue
  InsnQueue<VTA_MAX_XFER, kBufferCoherent, kAlwaysCache> insn_queue_;
  // Device handle
  VTADeviceHandle device_{nullptr};
};

}  // namespace vta

void* VTABufferAlloc(size_t size) {
  return vta::DataBuffer::Alloc(size);
}

void VTABufferFree(void* buffer) {
  vta::DataBuffer::Free(vta::DataBuffer::FromHandle(buffer));
}

void VTABufferCopy(const void* from,
                   size_t from_offset,
                   void* to,
                   size_t to_offset,
                   size_t size,
                   int kind_mask) {
  vta::DataBuffer* from_buffer = nullptr;
  vta::DataBuffer* to_buffer = nullptr;

  if (kind_mask & 2) {
    from_buffer = vta::DataBuffer::FromHandle(from);
    from = from_buffer->virt_addr();
  }
  if (kind_mask & 1) {
    to_buffer = vta::DataBuffer::FromHandle(to);
    to = to_buffer->virt_addr();
  }

  if (from_buffer) {
    // This is an FPGA-to-host memory transfer
    from_buffer->InvalidateCache(from_offset, size);
    from_buffer->MemCopyToHost(static_cast<char*>(to) + to_offset,
                               static_cast<const char*>(from) + from_offset,
                               size);
  } else if (to_buffer) {
    // This is a host-to-FPGA memory transfer
    to_buffer->MemCopyFromHost(static_cast<char*>(to) + to_offset,
                               static_cast<const char*>(from) + from_offset,
                               size);
    to_buffer->FlushCache(to_offset, size);
  }
}

VTACommandHandle VTATLSCommandHandle() {
  return vta::CommandQueue::ThreadLocal().get();
}

void VTARuntimeShutdown() {
  vta::CommandQueue::Shutdown();
}

void VTASetDebugMode(VTACommandHandle cmd, int debug_flag) {
  static_cast<vta::CommandQueue*>(cmd)->
      SetDebugFlag(debug_flag);
}

void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer) {
  return vta::DataBuffer::FromHandle(buffer)->virt_addr();
}

void VTAWriteBarrier(VTACommandHandle cmd,
                     void* buffer,
                     uint32_t elem_bits,
                     uint32_t start,
                     uint32_t extent) {
  static_cast<vta::CommandQueue*>(cmd)->
      WriteBarrier(buffer, elem_bits, start, extent);
}

void VTAReadBarrier(VTACommandHandle cmd,
                    void* buffer,
                    uint32_t elem_bits,
                    uint32_t start,
                    uint32_t extent) {
  static_cast<vta::CommandQueue*>(cmd)->
      ReadBarrier(buffer, elem_bits, start, extent);
}

void VTALoadBuffer2D(VTACommandHandle cmd,
                     void* src_dram_addr,
                     uint32_t src_elem_offset,
                     uint32_t x_size,
                     uint32_t y_size,
                     uint32_t x_stride,
                     uint32_t x_pad_before,
                     uint32_t y_pad_before,
                     uint32_t x_pad_after,
                     uint32_t y_pad_after,
                     uint32_t dst_sram_index,
                     uint32_t dst_memory_type) {
  static_cast<vta::CommandQueue*>(cmd)->
      LoadBuffer2D(src_dram_addr, src_elem_offset,
                   x_size, y_size, x_stride,
                   x_pad_before, y_pad_before,
                   x_pad_after, y_pad_after,
                   dst_sram_index, dst_memory_type);
}

void VTAStoreBuffer2D(VTACommandHandle cmd,
                      uint32_t src_sram_index,
                      uint32_t src_memory_type,
                      void* dst_dram_addr,
                      uint32_t dst_elem_offset,
                      uint32_t x_size,
                      uint32_t y_size,
                      uint32_t x_stride) {
  static_cast<vta::CommandQueue*>(cmd)->
      StoreBuffer2D(src_sram_index, src_memory_type,
                    dst_dram_addr, dst_elem_offset,
                    x_size, y_size, x_stride);
}

void VTAUopPush(uint32_t mode,
                uint32_t reset_out,
                uint32_t dst_index,
                uint32_t src_index,
                uint32_t wgt_index,
                uint32_t opcode,
                uint32_t use_imm,
                int32_t imm_val) {
  vta::CommandQueue::ThreadLocal()->record_kernel()
      ->Push(mode, reset_out, dst_index, src_index,
             wgt_index, opcode, use_imm, imm_val);
}

void VTAUopLoopBegin(uint32_t extent,
                     uint32_t dst_factor,
                     uint32_t src_factor,
                     uint32_t wgt_factor) {
  vta::CommandQueue::ThreadLocal()->record_kernel()
      ->PushLoopBegin(extent, dst_factor, src_factor, wgt_factor);
}

void VTAUopLoopEnd() {
  vta::CommandQueue::ThreadLocal()->record_kernel()
      ->PushLoopEnd();
}

int VTAPushGEMMOp(void** uop_handle,
                  int (*finit)(void*),
                  void* signature,
                  int nbytes) {
  vta::CommandQueue::ThreadLocal()->
      PushGEMMOp(uop_handle, finit, signature, nbytes);
  return 0;
}

int VTAPushALUOp(void** uop_handle,
                 int (*finit)(void*),
                 void* signature,
                 int nbytes) {
  vta::CommandQueue::ThreadLocal()->
      PushALUUop(uop_handle, finit, signature, nbytes);
  return 0;
}

int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid) {
  static_cast<vta::CommandQueue*>(cmd)->
      DepPush(from_qid, to_qid);
  return 0;
}

int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid) {
  static_cast<vta::CommandQueue*>(cmd)->
      DepPop(from_qid, to_qid);
  return 0;
}

void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) {
  static_cast<vta::CommandQueue*>(cmd)->
      Synchronize(wait_cycles);
}
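
/*
 * Illustrative end-to-end sketch (not part of the runtime): a minimal host
 * program driving the C API above. The sizes, offsets, `inp_elem_bits`,
 * `uop_handle`, and the `finit` callback (which would record uops via
 * VTAUopLoopBegin/VTAUopPush) are hypothetical; in TVM they are emitted by
 * the compiler.
 *
 *   VTACommandHandle cmd = VTATLSCommandHandle();
 *   void* inp = VTABufferAlloc(inp_bytes);
 *   void* out = VTABufferAlloc(out_bytes);
 *   // ... fill inp via VTABufferCPUPtr(cmd, inp), then make it visible:
 *   VTAWriteBarrier(cmd, inp, inp_elem_bits, 0, num_inp_elems);
 *   VTALoadBuffer2D(cmd, inp, 0, x_size, y_size, x_stride,
 *                   0, 0, 0, 0, 0, VTA_MEM_ID_INP);
 *   VTAPushGEMMOp(&uop_handle, finit, signature, nbytes);
 *   VTAStoreBuffer2D(cmd, 0, VTA_MEM_ID_OUT, out, 0, x_size, y_size, x_stride);
 *   VTASynchronize(cmd, 1u << 31);
 *   VTABufferFree(inp);
 *   VTABufferFree(out);
 *   VTARuntimeShutdown();
 */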