1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 /*!
21 * \file runtime.cc
22 * \brief Generic VTA runtime in C++11.
23 *
24 * The runtime depends on specific instruction
25 * stream spec as specified in hw_spec.h
26 */
27 #include <vta/driver.h>
28 #include <vta/hw_spec.h>
29 #include <vta/runtime.h>
30 #include <dmlc/logging.h>
31 #include <tvm/runtime/c_runtime_api.h>
32
33 #include <cassert>
34 #include <cstring>
35 #include <vector>
36 #include <memory>
37
38 namespace vta {
39
40 // Avoid bad configurations.
41 static_assert(VTA_UOP_WIDTH == sizeof(VTAUop) * 8,
42 "VTA_UOP_WIDTH do not match VTAUop size");
43
44 /*! \brief Enable coherent access of data buffers between VTA and CPU */
45 static const bool kBufferCoherent = VTA_COHERENT_ACCESSES;
46 /*! \brief Always cache buffers (otherwise, write back to DRAM from CPU) */
47 static const bool kAlwaysCache = true;
48
49 /*!
50 * \brief Data buffer represents data on CMA.
51 */
52 struct DataBuffer {
53 /*! \return Virtual address of the data. */
virt_addrvta::DataBuffer54 void* virt_addr() const {
55 return data_;
56 }
57 /*! \return Physical address of the data. */
phy_addrvta::DataBuffer58 vta_phy_addr_t phy_addr() const {
59 return phy_addr_;
60 }
61 /*!
62 * \brief Invalidate the cache of given location in data buffer.
63 * \param offset The offset to the data.
64 * \param size The size of the data.
65 */
InvalidateCachevta::DataBuffer66 void InvalidateCache(size_t offset, size_t size) {
67 if (!kBufferCoherent && kAlwaysCache) {
68 VTAInvalidateCache(reinterpret_cast<char *>(data_) + offset,
69 phy_addr_ + offset,
70 size);
71 }
72 }
73 /*!
74 * \brief Invalidate the cache of certain location in data buffer.
75 * \param offset The offset to the data.
76 * \param size The size of the data.
77 */
FlushCachevta::DataBuffer78 void FlushCache(size_t offset, size_t size) {
79 if (!kBufferCoherent && kAlwaysCache) {
80 VTAFlushCache(reinterpret_cast<char *>(data_) + offset,
81 phy_addr_ + offset,
82 size);
83 }
84 }
85 /*!
86 * \brief Performs a copy operation from host memory to buffer allocated with VTAMemAlloc.
87 * \param dst The desination buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc().
88 * \param src The source buffer in host memory.
89 * \param size Size of the region in Bytes.
90 */
MemCopyFromHostvta::DataBuffer91 void MemCopyFromHost(void* dst, const void* src, size_t size) {
92 VTAMemCopyFromHost(dst, src, size);
93 }
94 /*!
95 * \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory.
96 * \param dst The desination buffer in host memory.
97 * \param src The source buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc().
98 * \param size Size of the region in Bytes.
99 */
MemCopyToHostvta::DataBuffer100 void MemCopyToHost(void* dst, const void* src, size_t size) {
101 VTAMemCopyToHost(dst, src, size);
102 }
103 /*!
104 * \brief Allocate a buffer of a given size.
105 * \param size The size of the buffer.
106 */
Allocvta::DataBuffer107 static DataBuffer* Alloc(size_t size) {
108 void* data = VTAMemAlloc(size, kAlwaysCache);
109 CHECK(data != nullptr);
110 DataBuffer* buffer = new DataBuffer();
111 buffer->data_ = data;
112 buffer->phy_addr_ = VTAMemGetPhyAddr(data);
113 return buffer;
114 }
115 /*!
116 * \brief Free the data buffer.
117 * \param buffer The buffer to be freed.
118 */
Freevta::DataBuffer119 static void Free(DataBuffer* buffer) {
120 VTAMemFree(buffer->data_);
121 delete buffer;
122 }
123 /*!
124 * \brief Create data buffer header from buffer ptr.
125 * \param buffer The buffer pointer.
126 * \return The corresponding data buffer header.
127 */
FromHandlevta::DataBuffer128 static DataBuffer* FromHandle(const void* buffer) {
129 return const_cast<DataBuffer*>(
130 reinterpret_cast<const DataBuffer*>(buffer));
131 }
132
133 private:
134 /*! \brief The internal data. */
135 void* data_;
136 /*! \brief The physical address of the buffer, excluding header. */
137 vta_phy_addr_t phy_addr_;
138 };
139
140 /*!
141 * \brief Micro op kernel.
142 * Contains functions to construct the kernel with prefix Push.
143 */
144 class UopKernel {
145 public:
146 /*! \brief Loop information. */
147 struct LoopEntry {
148 uint32_t extent;
149 uint32_t dst_factor;
150 uint32_t src_factor;
151 uint32_t wgt_factor;
152 };
153 /*!
154 * \brief Construct UopKernel with signature.
155 * \param signature The pointer to signature.
156 * \param nbytes Number of bytes.
157 */
UopKernel(const char * signature,int nbytes)158 UopKernel(const char* signature, int nbytes)
159 : signature_(signature, signature + nbytes) {
160 }
161 /*!
162 * \brief Verify if the signature is correct.
163 * \param signature Signature ptr.
164 * \param nbytes Number of bytes.
165 */
MatchSignature(void * signature,int nbytes) const166 bool MatchSignature(void* signature, int nbytes) const {
167 if (static_cast<size_t>(nbytes) != signature_.size()) return false;
168 return memcmp(signature, signature_.data(), nbytes) == 0;
169 }
170 /*! \return Whether the kernel is cached in SRAM. */
cached() const171 bool cached() const {
172 return sram_begin_ != sram_end_;
173 }
174 /*! \return The length of the micro op sequence. */
size() const175 size_t size() const {
176 return seq_.size();
177 }
178 /*! \return The micro-op data. */
data() const179 const VTAUop* data() const {
180 return seq_.data();
181 }
182 /*! \return The loop structure. */
loop() const183 const std::vector<LoopEntry>& loop() const {
184 return loop_;
185 }
186 /*!
187 * \brief Declare loop start.
188 * \param extent The loop extent.
189 * \param dst_factor Loop factor of accum index.
190 * \param src_factor Loop factor of input index
191 * \param wgt_factor Loop factor of weight index.
192 */
PushLoopBegin(uint32_t extent,uint32_t dst_factor,uint32_t src_factor,uint32_t wgt_factor)193 void PushLoopBegin(uint32_t extent,
194 uint32_t dst_factor,
195 uint32_t src_factor,
196 uint32_t wgt_factor) {
197 LoopEntry le;
198 le.extent = extent;
199 le.dst_factor = dst_factor;
200 le.src_factor = src_factor;
201 le.wgt_factor = wgt_factor;
202 CHECK_EQ(seq_.size(), 0U);
203 CHECK_LT(loop_.size(), 2U);
204 loop_.push_back(le);
205 ++loop_ptr_;
206 }
207 /*!
208 * \brief Declare loop end.
209 */
PushLoopEnd()210 void PushLoopEnd() {
211 --loop_ptr_;
212 }
213 /*!
214 * \brief Push micro op into kernel.
215 * \param mode Set to GEMM mode if set to 0, ALU mode is set to 1.
216 * \param reset_out Resets the accum to 0.
217 * \param dst_index The accum memory index.
218 * \param src_index The input memory (gemm) / accum memory (alu) index.
219 * \param wgt_index The weight memory index.
220 * \param opcode The ALU opcode.
221 * \param use_imm Use immediate in ALU mode if set to true.
222 * \param imm_val Immediate value in ALU mode.
223 */
Push(uint32_t mode,uint32_t reset_out,uint32_t dst_index,uint32_t src_index,uint32_t wgt_index,uint32_t opcode,uint32_t use_imm,int32_t imm_val)224 void Push(uint32_t mode,
225 uint32_t reset_out,
226 uint32_t dst_index,
227 uint32_t src_index,
228 uint32_t wgt_index,
229 uint32_t opcode,
230 uint32_t use_imm,
231 int32_t imm_val) {
232 // The loop nest structure
233 VerifyDep(dst_index);
234 VTAUop op;
235 op.dst_idx = dst_index;
236 op.src_idx = src_index;
237 op.wgt_idx = wgt_index;
238 seq_.push_back(op);
239 // Ensure that mode is consistent if set
240 if (mode_ == 0xFFFFFFFF) {
241 mode_ = mode;
242 } else {
243 CHECK(mode_ == mode);
244 }
245 // Set reset_out field if unset
246 if (reset_out_ == 0xFFFFFFFF) {
247 reset_out_ = reset_out;
248 } else {
249 CHECK(reset_out_ == reset_out);
250 }
251 // Check kernel op and imm/imm_val in ALU mode
252 if (mode == 1) {
253 if (opcode_ == 0xFFFFFFFF) {
254 opcode_ = opcode;
255 use_imm_ = use_imm;
256 imm_val_ = imm_val;
257 } else {
258 CHECK(opcode_ == opcode);
259 CHECK(use_imm_ == use_imm);
260 CHECK(imm_val_ == imm_val);
261 }
262 }
263 }
264 /*! \brief Dump kernel micro ops to stdout. */
Dump()265 void Dump() {
266 uint32_t size = seq_.size();
267 printf("There are %u uops\n", size);
268 for (uint32_t i = 0; i < size; ++i) {
269 printf("[%04u]\t acc=%u, inp=%u, wgt=%u\n",
270 i,
271 seq_[i].dst_idx,
272 seq_[i].src_idx,
273 seq_[i].wgt_idx);
274 }
275 printf("\n");
276 }
277
278 public:
279 // The kernel's mode, opcode, immediate setting and value
280 uint32_t mode_{0xFFFFFFFF}; // UOP type: 0xFFFFFFFF - unset, 0 - GEMM, 1 - ALU
281 uint32_t opcode_{0xFFFFFFFF};
282 uint32_t reset_out_{0xFFFFFFFF};
283 bool use_imm_{false};
284 int16_t imm_val_{0};
285
286 private:
287 // Verify that we don't write to the same acc_mem index two cycles in a row
VerifyDep(uint32_t dst_index)288 void VerifyDep(uint32_t dst_index) {
289 size_t step = std::min(static_cast<size_t>(2U), seq_.size());
290 for (size_t i = seq_.size() - step; i < seq_.size(); ++i) {
291 CHECK(seq_[i].dst_idx != dst_index);
292 }
293 }
294 // The uop buffer
295 template<int, bool, bool>
296 friend class UopQueue;
297 friend class CommandQueue;
298 // SRAM location if begin != end
299 uint32_t sram_begin_{0};
300 uint32_t sram_end_{0};
301 // The signature used for verification
302 std::vector<char> signature_;
303 // Internal sequence
304 std::vector<VTAUop> seq_;
305 // The loop nest structure specific to ALU instructions
306 std::vector<LoopEntry> loop_;
307 // The loop pointer
308 size_t loop_ptr_{0};
309 };
310
311 /*!
312 * \brief Base class of all queues to send and recv serial data.
313 */
314 template <class T>
315 class BaseQueue {
316 public:
~BaseQueue()317 ~BaseQueue() {
318 if (fpga_buff_ != nullptr) {
319 VTAMemFree(fpga_buff_);
320 }
321 }
322 /*! \return Content of DRAM buffer. */
dram_buffer() const323 char* dram_buffer() const {
324 return dram_buffer_;
325 }
326 /*! \return Physical address of DRAM. */
dram_phy_addr() const327 vta_phy_addr_t dram_phy_addr() const {
328 CHECK(fpga_buff_phy_);
329 return fpga_buff_phy_;
330 }
331 /*! \return Whether there is pending information. */
pending() const332 bool pending() const {
333 return sram_begin_ != sram_end_;
334 }
335 /*! \brief Initialize the space of the buffer. */
InitSpace(uint32_t elem_bytes,uint32_t max_bytes,bool coherent,bool always_cache)336 void InitSpace(uint32_t elem_bytes, uint32_t max_bytes, bool coherent, bool always_cache) {
337 coherent_ = coherent;
338 always_cache_ = always_cache;
339 elem_bytes_ = elem_bytes;
340 // Allocate buffer ahead of time
341 fpga_buff_ = static_cast<char*>(VTAMemAlloc(
342 max_bytes, coherent_ || always_cache_));
343 CHECK(fpga_buff_ != nullptr);
344 fpga_buff_phy_ = VTAMemGetPhyAddr(fpga_buff_);
345 }
346 /*!
347 * \brief Reset the pointer of the buffer.
348 * Set SRAM pointer to be the current end.
349 */
Reset()350 virtual void Reset() {
351 dram_buffer_.clear();
352 sram_begin_ = sram_end_;
353 }
354
355 protected:
356 // Cache coherence access (shared memory only)
357 bool coherent_{false};
358 // Make the buffer cacheable
359 bool always_cache_{false};
360 // Element bytes
361 uint32_t elem_bytes_{0};
362 // Begin location of current SRAM read in FIFO mode
363 uint32_t sram_begin_{0};
364 // End location of current SRAM write in FIFO mode
365 uint32_t sram_end_{0};
366 // The buffer in DRAM
367 std::vector<T> dram_buffer_;
368 // FPGA accessible buffer
369 void* fpga_buff_{NULL};
370 // Physical address of the FPGA buffer
371 vta_phy_addr_t fpga_buff_phy_{0};
372 };
373
374 /*!
375 * \brief Micro op buffer that manages the micro op cache.
376 */
377 template<int kMaxBytes, bool kCoherent, bool kAlwaysCache>
378 class UopQueue : public BaseQueue<VTAUop> {
379 public:
InitSpace()380 void InitSpace() {
381 BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache);
382 }
383 // Push data to the queue
384 template<typename FAutoSync>
Push(UopKernel * kernel,FAutoSync fautosync)385 void Push(UopKernel* kernel, FAutoSync fautosync) {
386 // if the micro-op is cached in VTA SRAM, skip
387 if (kernel->cached()) return;
388 // check if we've exceeded the size of the allocated FPGA readable buffer
389 size_t num_op = kernel->size();
390 if (dram_buffer_.size() + num_op > kMaxElems) {
391 fautosync();
392 CHECK(dram_buffer_.size() <= kMaxElems);
393 }
394 // Cannot have a micro-op kernel larger than SRAM buffer
395 CHECK(num_op <= kMaxNumUop);
396 uint32_t uop_begin = 0;
397 if (sram_end_ + num_op > kMaxNumUop) {
398 // Need to evict
399 cache_idx_ = 0;
400 sram_begin_ = 0;
401 sram_end_ = num_op;
402 } else {
403 uop_begin = sram_end_;
404 sram_end_ += num_op;
405 }
406 // Simple eviction policy
407 uint32_t evict_begin = cache_idx_;
408 for (; cache_idx_ < cache_.size(); ++cache_idx_) {
409 if (cache_[cache_idx_]->sram_begin_ >= sram_end_) break;
410 // Mark the kernel as "invalid"
411 cache_[cache_idx_]->sram_begin_ = 0;
412 cache_[cache_idx_]->sram_end_ = 0;
413 }
414 // Increase size of buffer
415 kernel->sram_begin_ = uop_begin;
416 kernel->sram_end_ = sram_end_;
417 CHECK(kernel->cached());
418 cache_.insert(cache_.begin() + cache_idx_, kernel);
419 cache_.erase(cache_.begin() + evict_begin, cache_.begin() + cache_idx_);
420 cache_idx_ = evict_begin + 1;
421 }
422 // Flush micro op load instruction
FlushUopLoad(VTAMemInsn * insn)423 void FlushUopLoad(VTAMemInsn* insn) {
424 if (sram_begin_ != sram_end_) {
425 // Derive offset in FPGA-readable buffer
426 int32_t offset = 0;
427 for (uint32_t i = 0; i < cache_idx_ - 1; ++i) {
428 offset += cache_[i]->size() * kElemBytes;
429 }
430 insn->memory_type = VTA_MEM_ID_UOP;
431 insn->sram_base = sram_begin_;
432 // Update cache idx to physical address map
433 insn->dram_base = (fpga_buff_phy_ + offset) / kElemBytes;
434 insn->y_size = 1;
435 insn->x_size = (sram_end_ - sram_begin_);
436 insn->x_stride = (sram_end_ - sram_begin_);
437 insn->y_pad_0 = 0;
438 insn->y_pad_1 = 0;
439 insn->x_pad_0 = 0;
440 insn->x_pad_1 = 0;
441 // Reset indices
442 sram_begin_ = sram_end_;
443 }
444 }
445 /*! \brief clear cache and reset base queue buffer.*/
Reset()446 void Reset() {
447 cache_.clear();
448 cache_idx_ = 0;
449 BaseQueue<VTAUop>::Reset();
450 }
AutoReadBarrier()451 void AutoReadBarrier() {
452 ReadBarrier();
453 }
454 /*! \brief Writer barrier to make sure that data written by CPU is visible to VTA. */
ReadBarrier()455 void ReadBarrier() {
456 CHECK(fpga_buff_ != nullptr);
457 CHECK(fpga_buff_phy_);
458 // Iterate over caches; allocate buffer in FPGA-readable memory
459 uint32_t buff_size = 0;
460 for (uint32_t i = 0; i < cache_.size(); ++i) {
461 buff_size += cache_[i]->size() * kElemBytes;
462 }
463 CHECK(buff_size <= kMaxBytes);
464 // Move kernel contents to FPGA readable buffer
465 uint32_t offset = 0;
466 for (uint32_t i = 0; i < cache_.size(); ++i) {
467 uint32_t ksize = cache_[i]->size() * kElemBytes;
468 VTAMemCopyFromHost(static_cast<char*>(fpga_buff_) + offset,
469 cache_[i]->data(),
470 ksize);
471 // Update offset
472 offset += ksize;
473 }
474 // Flush if we're using a shared memory system
475 // and if interface is non-coherent
476 if (!coherent_ && always_cache_) {
477 VTAFlushCache(fpga_buff_,
478 fpga_buff_phy_,
479 offset);
480 }
481 }
482
483 private:
484 // Cache pointer
485 uint32_t cache_idx_{0};
486 // Cached ring, sorted by sram_begin
487 std::vector<UopKernel*> cache_;
488 // Constants
489 static constexpr int kElemBytes = sizeof(VTAUop);
490 static constexpr int kMaxNumUop = VTA_UOP_BUFF_DEPTH;
491 static constexpr int kMaxElems = kMaxBytes / kElemBytes;
492 };
493
494 // Internal kernel structure
495 class UopKernelMap {
496 public:
497 // Simple hash map
Get(void * signature,int nbytes)498 UopKernel** Get(void* signature,
499 int nbytes) {
500 uint32_t key = 0;
501 CHECK(nbytes == 0 || nbytes == sizeof(int));
502 if (nbytes == sizeof(int)) {
503 memcpy(&key, signature, sizeof(int));
504 key = key + 1;
505 }
506 CHECK_LT(key, 100);
507 if (kmap_.size() <= key) {
508 kmap_.resize(key + 1, nullptr);
509 }
510 return &(kmap_[key]);
511 }
512
513 private:
514 std::vector<UopKernel*> kmap_;
515 };
516
/*! \brief Pipeline stage identifiers; the values are used as array indices. */
enum PipelineStage : int {
  kNoneStage = 0,  // 0: no stage
  kLoadStage,      // 1
  kComputeStage,   // 2
  kStoreStage      // 3
};
523
524 // Instruction Queue
525 template<int kMaxBytes, bool kCoherent, bool kAlwaysCache>
526 class InsnQueue : public BaseQueue<VTAGenericInsn> {
527 public:
528 /*! \brief Initialize the space. */
InitSpace()529 void InitSpace() {
530 BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache);
531 // Initialize the stage
532 std::fill(pending_pop_prev_, pending_pop_prev_ + 4, 0);
533 std::fill(pending_pop_next_, pending_pop_next_ + 4, 0);
534 }
535 /*! \return The data pointer. */
data()536 VTAGenericInsn* data() {
537 return dram_buffer_.data();
538 }
539 /*! \return Number of instructions. */
count()540 uint32_t count() {
541 return dram_buffer_.size();
542 }
543 // Insert dependency push of load
DepPop(int from,int to)544 void DepPop(int from, int to) {
545 // NOTE: This instruction executes on queue[to]
546 if (from < to) {
547 if (pending_pop_prev_[to]) {
548 this->CommitPendingPop(to);
549 }
550 pending_pop_prev_[to] = 1;
551 } else {
552 if (pending_pop_next_[to]) {
553 this->CommitPendingPop(to);
554 }
555 pending_pop_next_[to] = 1;
556 }
557 // Impossible condition
558 CHECK(from != kLoadStage || to != kStoreStage);
559 CHECK(from != kStoreStage || to != kLoadStage);
560 }
561 // Insert dependency push of load
DepPush(int from,int to)562 void DepPush(int from, int to) {
563 // NOTE: this instruction executes on queue[from]
564 this->CommitPendingPop(from);
565 if (!dram_buffer_.empty()) {
566 VTAMemInsn* mptr = reinterpret_cast<VTAMemInsn*>(&dram_buffer_.back());
567 if (GetPipelineStage(mptr) == from) {
568 if (from < to && !mptr->push_next_dep) {
569 // push(LD->C) or push(C->ST)
570 mptr->push_next_dep = true; return;
571 } else if (from > to && !mptr->push_prev_dep) {
572 // push(C->LD) or push(ST->C)
573 mptr->push_prev_dep = true; return;
574 }
575 }
576 }
577 if (from < to) {
578 // Push next dep
579 PushNoop(from, false, true, false, false);
580 } else {
581 // Push prev dep
582 PushNoop(from, true, false, false, false);
583 }
584 }
585 // Create a new instruction for a GEMM stage
CreateGemInsn()586 VTAGemInsn* CreateGemInsn() {
587 return reinterpret_cast<VTAGemInsn*>(
588 Create(kComputeStage));
589 }
590 // Create a new instruction for a ALU stage
CreateAluInsn()591 VTAAluInsn* CreateAluInsn() {
592 return reinterpret_cast<VTAAluInsn*>(
593 Create(kComputeStage));
594 }
595 // Create a new instruction for a memory stage
CreateMemInsn(int memory_type)596 VTAMemInsn* CreateMemInsn(int memory_type) {
597 return reinterpret_cast<VTAMemInsn*>(
598 Create(GetMemPipelineStage(memory_type)));
599 }
600 // create a new instruction for a store stage
CreateStoreInsn()601 VTAMemInsn* CreateStoreInsn() {
602 return reinterpret_cast<VTAMemInsn*>(
603 Create(kStoreStage));
604 }
605 // Rewrite instruction stream to force serial execution
RewriteForceSerial()606 void RewriteForceSerial() {
607 int insn_count = count();
608 VTAMemInsn* mem_ptr = reinterpret_cast<VTAMemInsn*>(data());
609 VTAMemInsn* mem_last_store_ptr = nullptr;
610 VTAMemInsn* mem_last_ptr = nullptr;
611 for (int i = 1; i < insn_count; ++i) {
612 PipelineStage prev = GetPipelineStageAll(mem_ptr + i - 1);
613 PipelineStage now = GetPipelineStageAll(mem_ptr + i);
614 if (prev == kLoadStage && now == kComputeStage) {
615 mem_ptr[i - 1].push_prev_dep = false;
616 mem_ptr[i - 1].push_next_dep = true;
617 mem_ptr[i].pop_prev_dep = true;
618 mem_ptr[i].pop_next_dep = false;
619 } else if (prev == kComputeStage && now == kLoadStage) {
620 mem_ptr[i - 1].push_prev_dep = true;
621 mem_ptr[i - 1].push_next_dep = false;
622 mem_ptr[i].pop_prev_dep = false;
623 mem_ptr[i].pop_next_dep = true;
624 } else if (prev == kStoreStage && now == kComputeStage) {
625 mem_ptr[i - 1].push_prev_dep = true;
626 mem_ptr[i - 1].push_next_dep = false;
627 mem_ptr[i].pop_prev_dep = false;
628 mem_ptr[i].pop_next_dep = true;
629 } else if (prev == kComputeStage && now == kStoreStage) {
630 mem_ptr[i - 1].push_prev_dep = false;
631 mem_ptr[i - 1].push_next_dep = true;
632 mem_ptr[i].pop_prev_dep = true;
633 mem_ptr[i].pop_next_dep = false;
634 } else {
635 mem_ptr[i - 1].push_prev_dep = false;
636 mem_ptr[i - 1].push_next_dep = false;
637 mem_ptr[i].pop_prev_dep = false;
638 mem_ptr[i].pop_next_dep = false;
639 }
640 if (now == kStoreStage) {
641 mem_last_store_ptr = &mem_ptr[i];
642 }
643 mem_last_ptr = &mem_ptr[i];
644 }
645 // set dependency to make sure all core instruction get excuted
646 // before last FINISH instruction
647 if (mem_last_store_ptr && mem_last_ptr == mem_last_store_ptr) {
648 mem_last_store_ptr->push_prev_dep = true;
649 if (!pending_pop_next_[kComputeStage]) {
650 DepPop(kStoreStage, kComputeStage);
651 }
652 CommitPendingPop(kComputeStage);
653 } else {
654 pending_pop_next_[kComputeStage] = 0;
655 }
656 DepPush(kComputeStage, kLoadStage);
657 DepPop(kLoadStage, kComputeStage);
658 if (!pending_pop_next_[kLoadStage]) {
659 DepPop(kComputeStage, kLoadStage);
660 }
661 CommitPendingPop(kLoadStage);
662 DepPush(kLoadStage, kComputeStage);
663 CommitPendingPop(kComputeStage);
664 }
665 // Helper function: Get Opcode string
getOpcodeString(int opcode,bool use_imm)666 const char* getOpcodeString(int opcode, bool use_imm) {
667 // The string name
668 if (opcode == VTA_ALU_OPCODE_MIN) {
669 if (use_imm) {
670 return "min imm";
671 } else {
672 return "min";
673 }
674 } else if (opcode == VTA_ALU_OPCODE_MAX) {
675 if (use_imm) {
676 return "max imm";
677 } else {
678 return "max";
679 }
680 } else if (opcode == VTA_ALU_OPCODE_ADD) {
681 if (use_imm) {
682 return "add imm";
683 } else {
684 return "add";
685 }
686 } else if (opcode == VTA_ALU_OPCODE_SHR) {
687 return "shr";
688 }
689
690 return "unknown op";
691 }
692 // Dump instructions in the queue
DumpInsn()693 void DumpInsn() {
694 // Keep tabs on dependence queues
695 int l2g_queue = 0;
696 int g2l_queue = 0;
697 int s2g_queue = 0;
698 int g2s_queue = 0;
699 // Converter
700 union VTAInsn c;
701 // Iterate over all instructions
702 int insn_count = count();
703 const VTAGenericInsn* insn = data();
704 printf("There are %u instructions\n", insn_count);
705 for (int i = 0; i < insn_count; ++i) {
706 // Fetch instruction and decode opcode
707 c.generic = insn[i];
708 printf("INSTRUCTION %u: ", i);
709 if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
710 if (c.mem.x_size == 0) {
711 if (c.mem.opcode == VTA_OPCODE_STORE) {
712 printf("NOP-STORE-STAGE\n");
713 } else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) {
714 printf("NOP-COMPUTE-STAGE\n");
715 } else {
716 printf("NOP-MEMORY-STAGE\n");
717 }
718 printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
719 static_cast<int>(c.mem.pop_prev_dep),
720 static_cast<int>(c.mem.pop_next_dep),
721 static_cast<int>(c.mem.push_prev_dep),
722 static_cast<int>(c.mem.push_next_dep));
723 // Count status in queues
724 if (c.mem.opcode == VTA_OPCODE_STORE) {
725 CHECK(c.mem.pop_next_dep == false);
726 CHECK(c.mem.push_next_dep == false);
727 if (c.mem.pop_prev_dep) g2s_queue--;
728 if (c.mem.push_prev_dep) s2g_queue++;
729 } else if (c.mem.opcode == VTA_OPCODE_LOAD &&
730 (c.mem.memory_type == VTA_MEM_ID_INP ||
731 c.mem.memory_type == VTA_MEM_ID_WGT) ) {
732 CHECK(c.mem.pop_prev_dep == false);
733 CHECK(c.mem.push_prev_dep == false);
734 if (c.mem.pop_next_dep) g2l_queue--;
735 if (c.mem.push_next_dep) l2g_queue++;
736 } else {
737 if (c.mem.pop_prev_dep) l2g_queue--;
738 if (c.mem.push_prev_dep) g2l_queue++;
739 if (c.mem.pop_next_dep) s2g_queue--;
740 if (c.mem.push_next_dep) g2s_queue++;
741 }
742 printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
743 printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
744 continue;
745 }
746 // Print instruction field information
747 if (c.mem.opcode == VTA_OPCODE_LOAD) {
748 printf("LOAD ");
749 if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n");
750 if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n");
751 if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n");
752 if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n");
753 }
754 if (c.mem.opcode == VTA_OPCODE_STORE) {
755 printf("STORE:\n");
756 }
757 printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
758 static_cast<int>(c.mem.pop_prev_dep),
759 static_cast<int>(c.mem.pop_next_dep),
760 static_cast<int>(c.mem.push_prev_dep),
761 static_cast<int>(c.mem.push_next_dep));
762 printf("\tDRAM: 0x%08x, SRAM:0x%04x\n",
763 static_cast<int>(c.mem.dram_base),
764 static_cast<int>(c.mem.sram_base));
765 printf("\ty: size=%d, pad=[%d, %d]\n",
766 static_cast<int>(c.mem.y_size),
767 static_cast<int>(c.mem.y_pad_0),
768 static_cast<int>(c.mem.y_pad_1));
769 printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n",
770 static_cast<int>(c.mem.x_size),
771 static_cast<int>(c.mem.x_stride),
772 static_cast<int>(c.mem.x_pad_0),
773 static_cast<int>(c.mem.x_pad_1));
774 } else if (c.mem.opcode == VTA_OPCODE_GEMM) {
775 // Print instruction field information
776 printf("GEMM\n");
777
778 printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
779 static_cast<int>(c.mem.pop_prev_dep),
780 static_cast<int>(c.mem.pop_next_dep),
781 static_cast<int>(c.mem.push_prev_dep),
782 static_cast<int>(c.mem.push_next_dep));
783 printf("\treset_out: %d\n", static_cast<int>(c.gemm.reset_reg));
784 printf("\trange (%d, %d)\n",
785 static_cast<int>(c.gemm.uop_bgn),
786 static_cast<int>(c.gemm.uop_end));
787 printf("\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n",
788 static_cast<int>(c.gemm.iter_out),
789 static_cast<int>(c.gemm.wgt_factor_out),
790 static_cast<int>(c.gemm.src_factor_out),
791 static_cast<int>(c.gemm.dst_factor_out));
792 printf("\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n",
793 static_cast<int>(c.gemm.iter_in),
794 static_cast<int>(c.gemm.wgt_factor_in),
795 static_cast<int>(c.gemm.src_factor_in),
796 static_cast<int>(c.gemm.dst_factor_in));
797 } else if (c.mem.opcode == VTA_OPCODE_ALU) {
798 // Print instruction field information
799 printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm));
800 printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
801 static_cast<int>(c.mem.pop_prev_dep),
802 static_cast<int>(c.mem.pop_next_dep),
803 static_cast<int>(c.mem.push_prev_dep),
804 static_cast<int>(c.mem.push_next_dep));
805 printf("\treset_out: %d\n", static_cast<int>(c.alu.reset_reg));
806 printf("\trange (%d, %d)\n",
807 static_cast<int>(c.alu.uop_bgn),
808 static_cast<int>(c.alu.uop_end));
809 printf("\touter loop - iter: %d, dst: %d, src: %d\n",
810 static_cast<int>(c.alu.iter_out),
811 static_cast<int>(c.alu.dst_factor_out),
812 static_cast<int>(c.alu.src_factor_out));
813 printf("\tinner loop - iter: %d, dst: %d, src: %d\n",
814 static_cast<int>(c.alu.iter_in),
815 static_cast<int>(c.alu.dst_factor_in),
816 static_cast<int>(c.alu.src_factor_in));
817 } else if (c.mem.opcode == VTA_OPCODE_FINISH) {
818 printf("FINISH\n");
819 }
820
821 // Count status in queues
822 if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
823 if (c.mem.opcode == VTA_OPCODE_STORE) {
824 CHECK(c.mem.pop_next_dep == false);
825 CHECK(c.mem.push_next_dep == false);
826 if (c.mem.pop_prev_dep) g2s_queue--;
827 if (c.mem.push_prev_dep) s2g_queue++;
828 } else if (c.mem.opcode == VTA_OPCODE_LOAD &&
829 (c.mem.memory_type == VTA_MEM_ID_INP ||
830 c.mem.memory_type == VTA_MEM_ID_WGT) ) {
831 CHECK(c.mem.pop_prev_dep == false);
832 CHECK(c.mem.push_prev_dep == false);
833 if (c.mem.pop_next_dep) g2l_queue--;
834 if (c.mem.push_next_dep) l2g_queue++;
835 } else {
836 if (c.mem.pop_prev_dep) l2g_queue--;
837 if (c.mem.push_prev_dep) g2l_queue++;
838 if (c.mem.pop_next_dep) s2g_queue--;
839 if (c.mem.push_next_dep) g2s_queue++;
840 }
841 } else if (c.mem.opcode == VTA_OPCODE_GEMM ||
842 c.mem.opcode == VTA_OPCODE_ALU) {
843 // Print instruction field information
844 if (c.gemm.pop_prev_dep) l2g_queue--;
845 if (c.gemm.push_prev_dep) g2l_queue++;
846 if (c.gemm.pop_next_dep) s2g_queue--;
847 if (c.gemm.push_next_dep) g2s_queue++;
848 }
849 printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
850 printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
851 }
852 }
853 // Commit all pending pop of corresponding stage
CommitPendingPop(int stage)854 void CommitPendingPop(int stage) {
855 // Handle the LD<->compute queue
856 // NOTE: pop executes on target(stage)
857 CHECK(stage > 0 && stage < 4);
858 if (pending_pop_prev_[stage] ||
859 pending_pop_next_[stage]) {
860 PushNoop(stage, false, false,
861 pending_pop_prev_[stage],
862 pending_pop_next_[stage]);
863 pending_pop_prev_[stage] = 0;
864 pending_pop_next_[stage] = 0;
865 }
866 }
CommitPending()867 void CommitPending() {
868 for (int i = kLoadStage; i <= kStoreStage; ++i) {
869 CommitPendingPop(i);
870 }
871 }
PendingPop()872 bool PendingPop() {
873 for (int i = kLoadStage; i <= kStoreStage; ++i) {
874 if (pending_pop_prev_[i]) return true;
875 if (pending_pop_next_[i]) return true;
876 }
877 return false;
878 }
AutoReadBarrier()879 void AutoReadBarrier() {
880 ReadBarrier();
881 }
882 /*! \brief Writer barrier to make sure that data written by CPU is visible to VTA. */
ReadBarrier()883 void ReadBarrier() {
884 CHECK(fpga_buff_ != nullptr);
885 CHECK(fpga_buff_phy_);
886 uint32_t buff_size = dram_buffer_.size() * elem_bytes_;
887 CHECK(buff_size <= kMaxBytes);
888 // Copy contents of DRAM buffer to FPGA buff
889 VTAMemCopyFromHost(fpga_buff_,
890 dram_buffer_.data(),
891 buff_size);
892 // Flush if we're using a shared memory system
893 // and if interface is non-coherent
894 if (!coherent_ && always_cache_) {
895 VTAFlushCache(fpga_buff_,
896 fpga_buff_phy_,
897 buff_size);
898 }
899 }
900
901 protected:
902 /*! \return Add new instruction to the buffer. */
NextInsn()903 VTAGenericInsn* NextInsn() {
904 VTAGenericInsn insn;
905 dram_buffer_.push_back(insn);
906 return &dram_buffer_.back();
907 }
908 // Create a new instruction for a given stage
Create(PipelineStage stage)909 VTAGenericInsn* Create(PipelineStage stage) {
910 VTAGenericInsn* gptr = NextInsn();
911 VTAMemInsn* mptr = reinterpret_cast<VTAMemInsn*>(gptr);
912 mptr->pop_prev_dep = pending_pop_prev_[stage];
913 mptr->pop_next_dep = pending_pop_next_[stage];
914 mptr->push_prev_dep = false;
915 mptr->push_next_dep = false;
916 pending_pop_prev_[stage] = 0;
917 pending_pop_next_[stage] = 0;
918 return gptr;
919 }
920 // Get stage of the memory
GetMemPipelineStage(int memory_type)921 static PipelineStage GetMemPipelineStage(int memory_type) {
922 if (memory_type == VTA_MEM_ID_ACC) return kComputeStage;
923 if (memory_type == VTA_MEM_ID_UOP) return kComputeStage;
924 return kLoadStage;
925 }
926 // Get stage of the computation
GetPipelineStage(VTAMemInsn * insn)927 static PipelineStage GetPipelineStage(VTAMemInsn* insn) {
928 if (insn->opcode == VTA_OPCODE_GEMM) return kComputeStage;
929 if (insn->opcode == VTA_OPCODE_ALU) return kComputeStage;
930 if (insn->opcode == VTA_OPCODE_LOAD) {
931 if (insn->x_size == 0) return kNoneStage;
932 if (insn->memory_type == VTA_MEM_ID_ACC) return kComputeStage;
933 if (insn->memory_type == VTA_MEM_ID_UOP) return kComputeStage;
934 return kLoadStage;
935 }
936 if (insn->opcode == VTA_OPCODE_STORE) {
937 // FIXME: Right now memory_type is a 2-bit field which means that
938 // VTA_MEM_ID_OUT will appear as 0. For now we'll refrain from
939 // checking the memory_type to avoid an CHECK error...
940 return kStoreStage;
941 }
942 LOG(FATAL) << "not reached";
943 return kNoneStage;
944 }
945
946 // Get stage of memory and computation
GetPipelineStageAll(VTAMemInsn * insn)947 static PipelineStage GetPipelineStageAll(VTAMemInsn* insn) {
948 PipelineStage stage = GetPipelineStage(insn);
949 if (stage != kNoneStage) return stage;
950 return GetMemPipelineStage(insn->memory_type);
951 }
952
953 // Push no-op
PushNoop(int stage,bool push_prev_dep,bool push_next_dep,bool pop_prev_dep,bool pop_next_dep)954 void PushNoop(int stage,
955 bool push_prev_dep, bool push_next_dep,
956 bool pop_prev_dep, bool pop_next_dep) {
957 VTAMemInsn* insn = reinterpret_cast<VTAMemInsn*>(NextInsn());
958 insn->opcode = (stage == kStoreStage ? VTA_OPCODE_STORE : VTA_OPCODE_LOAD);
959 insn->push_prev_dep = push_prev_dep;
960 insn->push_next_dep = push_next_dep;
961 insn->pop_prev_dep = pop_prev_dep;
962 insn->pop_next_dep = pop_next_dep;
963 insn->sram_base = 0;
964 insn->dram_base = 0;
965 insn->y_size = 0;
966 insn->x_size = 0;
967 insn->x_stride = 0;
968 insn->y_pad_0 = 0;
969 insn->y_pad_1 = 0;
970 insn->x_pad_0 = 0;
971 insn->x_pad_1 = 0;
972 insn->memory_type = (stage == kLoadStage ? VTA_MEM_ID_INP : VTA_MEM_ID_UOP);
973 }
974
975 private:
// Pending pop of each instruction queue, qid=0 is not used
977 int pending_pop_prev_[4];
978 int pending_pop_next_[4];
979 static constexpr int kElemBytes = sizeof(VTAGenericInsn);
980 static constexpr int kMaxElems = kMaxBytes / kElemBytes;
981 };
982
983 /*!
984 * \brief The command queue object that handles the request.
985 */
986 class CommandQueue {
987 public:
CommandQueue()988 CommandQueue() {
989 this->InitSpace();
990 }
InitSpace()991 void InitSpace() {
992 uop_queue_.InitSpace();
993 insn_queue_.InitSpace();
994 device_ = VTADeviceAlloc();
995 CHECK(device_ != nullptr);
996 }
997
/*! \brief Release the device handle obtained from VTADeviceAlloc(). */
~CommandQueue() { VTADeviceFree(device_); }
1001
GetElemBytes(uint32_t memory_id)1002 uint32_t GetElemBytes(uint32_t memory_id) {
1003 uint32_t elem_bytes = 0;
1004 switch (memory_id) {
1005 case VTA_MEM_ID_UOP:
1006 elem_bytes = VTA_UOP_ELEM_BYTES;
1007 break;
1008 case VTA_MEM_ID_INP:
1009 elem_bytes = VTA_INP_ELEM_BYTES;
1010 break;
1011 case VTA_MEM_ID_WGT:
1012 elem_bytes = VTA_WGT_ELEM_BYTES;
1013 break;
1014 case VTA_MEM_ID_ACC:
1015 elem_bytes = VTA_ACC_ELEM_BYTES;
1016 break;
1017 case VTA_MEM_ID_OUT:
1018 elem_bytes = VTA_OUT_ELEM_BYTES;
1019 break;
1020 default:
1021 LOG(FATAL) << "Memory id not recognized:" << memory_id;
1022 break;
1023 }
1024 /*
1025 * elements size should not larger than VTA_PAGE_BYTES.
1026 *
1027 */
1028 CHECK_GE(VTA_PAGE_BYTES, elem_bytes);
1029 return elem_bytes;
1030 }
1031
LoadBuffer2D(void * src_dram_addr,uint32_t src_elem_offset,uint32_t x_size,uint32_t y_size,uint32_t x_stride,uint32_t x_pad_before,uint32_t y_pad_before,uint32_t x_pad_after,uint32_t y_pad_after,uint32_t dst_sram_index,uint32_t dst_memory_type)1032 void LoadBuffer2D(void* src_dram_addr,
1033 uint32_t src_elem_offset,
1034 uint32_t x_size,
1035 uint32_t y_size,
1036 uint32_t x_stride,
1037 uint32_t x_pad_before,
1038 uint32_t y_pad_before,
1039 uint32_t x_pad_after,
1040 uint32_t y_pad_after,
1041 uint32_t dst_sram_index,
1042 uint32_t dst_memory_type) {
1043 VTAMemInsn* insn = insn_queue_.CreateMemInsn(dst_memory_type);
1044 insn->opcode = VTA_OPCODE_LOAD;
1045 insn->memory_type = dst_memory_type;
1046 insn->sram_base = dst_sram_index;
1047 DataBuffer* src = DataBuffer::FromHandle(src_dram_addr);
1048 insn->dram_base = src->phy_addr() / GetElemBytes(dst_memory_type) + src_elem_offset;
1049 insn->y_size = y_size;
1050 insn->x_size = x_size;
1051 insn->x_stride = x_stride;
1052 insn->y_pad_0 = y_pad_before;
1053 insn->y_pad_1 = y_pad_after;
1054 insn->x_pad_0 = x_pad_before;
1055 insn->x_pad_1 = x_pad_after;
1056 this->CheckInsnOverFlow();
1057 }
1058
StoreBuffer2D(uint32_t src_sram_index,uint32_t src_memory_type,void * dst_dram_addr,uint32_t dst_elem_offset,uint32_t x_size,uint32_t y_size,uint32_t x_stride)1059 void StoreBuffer2D(uint32_t src_sram_index,
1060 uint32_t src_memory_type,
1061 void* dst_dram_addr,
1062 uint32_t dst_elem_offset,
1063 uint32_t x_size,
1064 uint32_t y_size,
1065 uint32_t x_stride) {
1066 VTAMemInsn* insn = insn_queue_.CreateStoreInsn();
1067 insn->opcode = VTA_OPCODE_STORE;
1068 insn->memory_type = src_memory_type;
1069 insn->sram_base = src_sram_index;
1070 DataBuffer* dst = DataBuffer::FromHandle(dst_dram_addr);
1071 insn->dram_base = dst->phy_addr() / GetElemBytes(src_memory_type) + dst_elem_offset;
1072 insn->y_size = y_size;
1073 insn->x_size = x_size;
1074 insn->x_stride = x_stride;
1075 insn->y_pad_0 = 0;
1076 insn->y_pad_1 = 0;
1077 insn->x_pad_0 = 0;
1078 insn->x_pad_1 = 0;
1079 this->CheckInsnOverFlow();
1080 }
1081
DepPush(int from_qid,int to_qid)1082 void DepPush(int from_qid, int to_qid) {
1083 insn_queue_.DepPush(from_qid, to_qid);
1084 }
1085
DepPop(int from_qid,int to_qid)1086 void DepPop(int from_qid, int to_qid) {
1087 insn_queue_.DepPop(from_qid, to_qid);
1088 }
1089
ReadBarrier(void * buffer,uint32_t elem_bits,uint32_t start,uint32_t extent)1090 void ReadBarrier(void* buffer, uint32_t elem_bits, uint32_t start, uint32_t extent) {
1091 if (!(debug_flag_ & VTA_DEBUG_SKIP_READ_BARRIER)) {
1092 uint32_t elem_bytes = (elem_bits + 8 - 1) / 8;
1093 DataBuffer::FromHandle(buffer)->FlushCache(
1094 elem_bytes * start, elem_bytes * extent);
1095 }
1096 }
1097
WriteBarrier(void * buffer,uint32_t elem_bits,uint32_t start,uint32_t extent)1098 void WriteBarrier(void* buffer, uint32_t elem_bits, uint32_t start, uint32_t extent) {
1099 if (!(debug_flag_ & VTA_DEBUG_SKIP_WRITE_BARRIER)) {
1100 uint32_t elem_bytes = (elem_bits + 8 - 1) / 8;
1101 DataBuffer::FromHandle(buffer)->InvalidateCache(
1102 elem_bytes * start, elem_bytes * extent);
1103 }
1104 }
1105
Synchronize(uint32_t wait_cycles)1106 void Synchronize(uint32_t wait_cycles) {
1107 // Insert dependences to force serialization
1108 if (debug_flag_ & VTA_DEBUG_FORCE_SERIAL) {
1109 insn_queue_.RewriteForceSerial();
1110 } else {
1111 // This will issue finish after last store finishes
1112 insn_queue_.DepPush(kStoreStage, kComputeStage);
1113 insn_queue_.DepPush(kLoadStage, kComputeStage);
1114 insn_queue_.DepPop(kStoreStage, kComputeStage);
1115 insn_queue_.DepPop(kLoadStage, kComputeStage);
1116 insn_queue_.CommitPendingPop(kComputeStage);
1117 }
1118 // NOTE: FINISH cannot contain pop
1119 VTAGemInsn* insn = insn_queue_.CreateGemInsn();
1120 insn->opcode = VTA_OPCODE_FINISH;
1121 CHECK(!insn_queue_.PendingPop());
1122 // Check if there are no instruction to execute at all
1123 if (insn_queue_.count() == 0) return;
1124 // Synchronization for the queues
1125 uop_queue_.AutoReadBarrier();
1126 insn_queue_.AutoReadBarrier();
1127 // Dump instructions if debug enabled
1128 if (debug_flag_ & VTA_DEBUG_DUMP_INSN) {
1129 insn_queue_.DumpInsn();
1130 }
1131 // Make sure that the last instruction is a finish instruction
1132 CHECK(reinterpret_cast<VTAMemInsn*>(
1133 insn_queue_.data())[insn_queue_.count()-1].opcode == VTA_OPCODE_FINISH);
1134
1135 // Make sure that we don't exceed contiguous physical memory limits
1136 CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) < VTA_MAX_XFER);
1137 int timeout = VTADeviceRun(
1138 device_,
1139 insn_queue_.dram_phy_addr(),
1140 insn_queue_.count(),
1141 wait_cycles);
1142 CHECK_EQ(timeout, 0);
1143 // Reset buffers
1144 uop_queue_.Reset();
1145 insn_queue_.Reset();
1146 }
1147
1148 // Get record kernel
record_kernel() const1149 UopKernel* record_kernel() const {
1150 CHECK(record_kernel_ != nullptr);
1151 return record_kernel_;
1152 }
1153
1154 // Set debug flag
SetDebugFlag(int debug_flag)1155 void SetDebugFlag(int debug_flag) {
1156 debug_flag_ = debug_flag;
1157 }
1158
PushGEMMOp(void ** uop_handle,int (* finit)(void *),void * signature,int nbytes)1159 void PushGEMMOp(void** uop_handle,
1160 int (*finit)(void*),
1161 void* signature,
1162 int nbytes) {
1163 UopKernelMap** uptr = reinterpret_cast<UopKernelMap**>(uop_handle);
1164 if (uptr[0] == nullptr) {
1165 uptr[0] = new UopKernelMap();
1166 }
1167 UopKernel** kptr = uptr[0]->Get(signature, nbytes);
1168 if (kptr[0] == nullptr) {
1169 record_kernel_ = new UopKernel(static_cast<char*>(signature), nbytes);
1170 CHECK_EQ((*finit)(signature), 0);
1171 kptr[0] = static_cast<UopKernel*>(record_kernel_);
1172 if (debug_flag_ & VTA_DEBUG_DUMP_UOP) {
1173 record_kernel_->Dump();
1174 }
1175 record_kernel_ = nullptr;
1176 }
1177 this->PushGEMMOp(static_cast<UopKernel*>(kptr[0]));
1178 this->CheckInsnOverFlow();
1179 }
1180
PushALUUop(void ** uop_handle,int (* finit)(void *),void * signature,int nbytes)1181 void PushALUUop(void** uop_handle,
1182 int (*finit)(void*),
1183 void* signature,
1184 int nbytes) {
1185 UopKernelMap** uptr = reinterpret_cast<UopKernelMap**>(uop_handle);
1186 if (uptr[0] == nullptr) {
1187 uptr[0] = new UopKernelMap();
1188 }
1189 UopKernel** kptr = uptr[0]->Get(signature, nbytes);
1190 if (kptr[0] == nullptr) {
1191 record_kernel_ = new UopKernel(static_cast<char*>(signature), nbytes);
1192 CHECK_EQ((*finit)(signature), 0);
1193 kptr[0] = static_cast<UopKernel*>(record_kernel_);
1194 if (debug_flag_ & VTA_DEBUG_DUMP_UOP) {
1195 record_kernel_->Dump();
1196 }
1197 record_kernel_ = nullptr;
1198 }
1199 this->PushALUUop(static_cast<UopKernel*>(kptr[0]));
1200 this->CheckInsnOverFlow();
1201 }
1202
ThreadLocal()1203 static std::shared_ptr<CommandQueue>& ThreadLocal() {
1204 static std::shared_ptr<CommandQueue> inst =
1205 std::make_shared<CommandQueue>();
1206 if (inst == nullptr) {
1207 inst = std::make_shared<CommandQueue>();
1208 }
1209 return inst;
1210 }
1211
Shutdown()1212 static void Shutdown() {
1213 ThreadLocal().reset();
1214 }
1215
1216 private:
1217 // Push GEMM uop to the command buffer
PushGEMMOp(UopKernel * kernel)1218 void PushGEMMOp(UopKernel* kernel) {
1219 uop_queue_.Push(kernel,
1220 [this]() { this->AutoSync(); });
1221 if (uop_queue_.pending()) {
1222 VTAMemInsn* insn = insn_queue_.CreateMemInsn(VTA_MEM_ID_UOP);
1223 insn->opcode = VTA_OPCODE_LOAD;
1224 uop_queue_.FlushUopLoad(insn);
1225 }
1226 VTAGemInsn* insn = insn_queue_.CreateGemInsn();
1227 insn->opcode = VTA_OPCODE_GEMM;
1228 insn->reset_reg = kernel->reset_out_;
1229 insn->uop_bgn = kernel->sram_begin_;
1230 insn->uop_end = kernel->sram_end_;
1231 const std::vector<UopKernel::LoopEntry> &loop = kernel->loop();
1232 if (loop.size() > 0) {
1233 insn->iter_out = loop[0].extent;
1234 insn->wgt_factor_out = loop[0].wgt_factor;
1235 insn->src_factor_out = loop[0].src_factor;
1236 insn->dst_factor_out = loop[0].dst_factor;
1237 } else {
1238 insn->iter_out = 1;
1239 insn->wgt_factor_out = 0;
1240 insn->src_factor_out = 0;
1241 insn->dst_factor_out = 0;
1242 }
1243 if (loop.size() > 1) {
1244 insn->iter_in = loop[1].extent;
1245 insn->wgt_factor_in = loop[1].wgt_factor;
1246 insn->src_factor_in = loop[1].src_factor;
1247 insn->dst_factor_in = loop[1].dst_factor;
1248 } else {
1249 insn->iter_in = 1;
1250 insn->wgt_factor_in = 0;
1251 insn->src_factor_in = 0;
1252 insn->dst_factor_in = 0;
1253 }
1254 }
1255
1256 // Push ALU uop to the command buffer
PushALUUop(UopKernel * kernel)1257 void PushALUUop(UopKernel* kernel) {
1258 uop_queue_.Push(kernel,
1259 [this]() { this->AutoSync(); });
1260 if (uop_queue_.pending()) {
1261 VTAMemInsn* insn = insn_queue_.CreateMemInsn(VTA_MEM_ID_UOP);
1262 insn->opcode = VTA_OPCODE_LOAD;
1263 uop_queue_.FlushUopLoad(insn);
1264 }
1265 VTAAluInsn* insn = insn_queue_.CreateAluInsn();
1266 insn->opcode = VTA_OPCODE_ALU;
1267 insn->reset_reg = kernel->reset_out_;
1268 insn->uop_bgn = kernel->sram_begin_;
1269 insn->uop_end = kernel->sram_end_;
1270 insn->alu_opcode = kernel->opcode_;
1271 insn->use_imm = kernel->use_imm_;
1272 insn->imm = kernel->imm_val_;
1273 const std::vector<UopKernel::LoopEntry> &loop = kernel->loop();
1274 if (loop.size() == 0) {
1275 insn->iter_out = 1;
1276 insn->dst_factor_out = 0;
1277 insn->src_factor_out = 0;
1278 insn->iter_in = 1;
1279 insn->dst_factor_in = 0;
1280 insn->src_factor_in = 0;
1281 } else if (loop.size() == 1) {
1282 insn->iter_out = 1;
1283 insn->dst_factor_out = 0;
1284 insn->src_factor_out = 0;
1285 insn->iter_in = loop[0].extent;
1286 insn->dst_factor_in = loop[0].dst_factor;
1287 insn->src_factor_in = loop[0].src_factor;
1288 } else {
1289 insn->iter_out = loop[0].extent;
1290 insn->dst_factor_out = loop[0].dst_factor;
1291 insn->src_factor_out = loop[0].src_factor;
1292 insn->iter_in = loop[1].extent;
1293 insn->dst_factor_in = loop[1].dst_factor;
1294 insn->src_factor_in = loop[1].src_factor;
1295 }
1296 }
1297
CheckInsnOverFlow()1298 void CheckInsnOverFlow() {
1299 // At each API call, we can at most commit:
1300 // one pending store, one pending load, and one uop
1301 if ((insn_queue_.count() + 4) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) {
1302 this->AutoSync();
1303 }
1304 }
1305 // Auto sync when instruction overflow
AutoSync()1306 void AutoSync() {
1307 this->Synchronize(1 << 31);
1308 }
1309
1310 // Internal debug flag
1311 int debug_flag_{0};
1312 // The kernel we are currently recording
1313 UopKernel* record_kernel_{nullptr};
1314 // Micro op queue
1315 UopQueue<VTA_MAX_XFER, kBufferCoherent, kAlwaysCache> uop_queue_;
1316 // instruction queue
1317 InsnQueue<VTA_MAX_XFER, kBufferCoherent, kAlwaysCache> insn_queue_;
1318 // Device handle
1319 VTADeviceHandle device_{nullptr};
1320 };
1321
1322 } // namespace vta
1323
VTABufferAlloc(size_t size)1324 void* VTABufferAlloc(size_t size) {
1325 return vta::DataBuffer::Alloc(size);
1326 }
1327
VTABufferFree(void * buffer)1328 void VTABufferFree(void* buffer) {
1329 vta::DataBuffer::Free(vta::DataBuffer::FromHandle(buffer));
1330 }
1331
VTABufferCopy(const void * from,size_t from_offset,void * to,size_t to_offset,size_t size,int kind_mask)1332 void VTABufferCopy(const void* from,
1333 size_t from_offset,
1334 void* to,
1335 size_t to_offset,
1336 size_t size,
1337 int kind_mask) {
1338 vta::DataBuffer* from_buffer = nullptr;
1339 vta::DataBuffer* to_buffer = nullptr;
1340
1341 if (kind_mask & 2) {
1342 from_buffer = vta::DataBuffer::FromHandle(from);
1343 from = from_buffer->virt_addr();
1344 }
1345 if (kind_mask & 1) {
1346 to_buffer = vta::DataBuffer::FromHandle(to);
1347 to = to_buffer->virt_addr();
1348 }
1349
1350 if (from_buffer) {
1351 // This is an FPGA to host mem transfer
1352 from_buffer->InvalidateCache(from_offset, size);
1353 from_buffer->MemCopyToHost(static_cast<char*>(to) + to_offset,
1354 static_cast<const char*>(from) + from_offset,
1355 size);
1356 } else if (to_buffer) {
1357 // This is a host to FPGA mem transfer
1358 to_buffer->MemCopyFromHost(static_cast<char*>(to) + to_offset,
1359 static_cast<const char*>(from) + from_offset,
1360 size);
1361 to_buffer->FlushCache(to_offset, size);
1362 }
1363 }
1364
VTATLSCommandHandle()1365 VTACommandHandle VTATLSCommandHandle() {
1366 return vta::CommandQueue::ThreadLocal().get();
1367 }
1368
VTARuntimeShutdown()1369 void VTARuntimeShutdown() {
1370 vta::CommandQueue::Shutdown();
1371 }
1372
VTASetDebugMode(VTACommandHandle cmd,int debug_flag)1373 void VTASetDebugMode(VTACommandHandle cmd, int debug_flag) {
1374 static_cast<vta::CommandQueue*>(cmd)->
1375 SetDebugFlag(debug_flag);
1376 }
1377
VTABufferCPUPtr(VTACommandHandle cmd,void * buffer)1378 void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer) {
1379 return vta::DataBuffer::FromHandle(buffer)->virt_addr();
1380 }
1381
VTAWriteBarrier(VTACommandHandle cmd,void * buffer,uint32_t elem_bits,uint32_t start,uint32_t extent)1382 void VTAWriteBarrier(VTACommandHandle cmd,
1383 void* buffer,
1384 uint32_t elem_bits,
1385 uint32_t start,
1386 uint32_t extent) {
1387 static_cast<vta::CommandQueue*>(cmd)->
1388 WriteBarrier(buffer, elem_bits, start, extent);
1389 }
1390
VTAReadBarrier(VTACommandHandle cmd,void * buffer,uint32_t elem_bits,uint32_t start,uint32_t extent)1391 void VTAReadBarrier(VTACommandHandle cmd,
1392 void* buffer,
1393 uint32_t elem_bits,
1394 uint32_t start,
1395 uint32_t extent) {
1396 static_cast<vta::CommandQueue*>(cmd)->
1397 ReadBarrier(buffer, elem_bits, start, extent);
1398 }
1399
VTALoadBuffer2D(VTACommandHandle cmd,void * src_dram_addr,uint32_t src_elem_offset,uint32_t x_size,uint32_t y_size,uint32_t x_stride,uint32_t x_pad_before,uint32_t y_pad_before,uint32_t x_pad_after,uint32_t y_pad_after,uint32_t dst_sram_index,uint32_t dst_memory_type)1400 void VTALoadBuffer2D(VTACommandHandle cmd,
1401 void* src_dram_addr,
1402 uint32_t src_elem_offset,
1403 uint32_t x_size,
1404 uint32_t y_size,
1405 uint32_t x_stride,
1406 uint32_t x_pad_before,
1407 uint32_t y_pad_before,
1408 uint32_t x_pad_after,
1409 uint32_t y_pad_after,
1410 uint32_t dst_sram_index,
1411 uint32_t dst_memory_type) {
1412 static_cast<vta::CommandQueue*>(cmd)->
1413 LoadBuffer2D(src_dram_addr, src_elem_offset,
1414 x_size, y_size, x_stride,
1415 x_pad_before, y_pad_before,
1416 x_pad_after, y_pad_after,
1417 dst_sram_index, dst_memory_type);
1418 }
1419
VTAStoreBuffer2D(VTACommandHandle cmd,uint32_t src_sram_index,uint32_t src_memory_type,void * dst_dram_addr,uint32_t dst_elem_offset,uint32_t x_size,uint32_t y_size,uint32_t x_stride)1420 void VTAStoreBuffer2D(VTACommandHandle cmd,
1421 uint32_t src_sram_index,
1422 uint32_t src_memory_type,
1423 void* dst_dram_addr,
1424 uint32_t dst_elem_offset,
1425 uint32_t x_size,
1426 uint32_t y_size,
1427 uint32_t x_stride) {
1428 static_cast<vta::CommandQueue*>(cmd)->
1429 StoreBuffer2D(src_sram_index, src_memory_type,
1430 dst_dram_addr, dst_elem_offset,
1431 x_size, y_size, x_stride);
1432 }
1433
VTAUopPush(uint32_t mode,uint32_t reset_out,uint32_t dst_index,uint32_t src_index,uint32_t wgt_index,uint32_t opcode,uint32_t use_imm,int32_t imm_val)1434 void VTAUopPush(uint32_t mode,
1435 uint32_t reset_out,
1436 uint32_t dst_index,
1437 uint32_t src_index,
1438 uint32_t wgt_index,
1439 uint32_t opcode,
1440 uint32_t use_imm,
1441 int32_t imm_val) {
1442 vta::CommandQueue::ThreadLocal()->record_kernel()
1443 ->Push(mode, reset_out, dst_index, src_index,
1444 wgt_index, opcode, use_imm, imm_val);
1445 }
1446
VTAUopLoopBegin(uint32_t extent,uint32_t dst_factor,uint32_t src_factor,uint32_t wgt_factor)1447 void VTAUopLoopBegin(uint32_t extent,
1448 uint32_t dst_factor,
1449 uint32_t src_factor,
1450 uint32_t wgt_factor) {
1451 vta::CommandQueue::ThreadLocal()->record_kernel()
1452 ->PushLoopBegin(extent, dst_factor, src_factor, wgt_factor);
1453 }
1454
VTAUopLoopEnd()1455 void VTAUopLoopEnd() {
1456 vta::CommandQueue::ThreadLocal()->record_kernel()
1457 ->PushLoopEnd();
1458 }
1459
VTAPushGEMMOp(void ** uop_handle,int (* finit)(void *),void * signature,int nbytes)1460 int VTAPushGEMMOp(void** uop_handle,
1461 int (*finit)(void*),
1462 void* signature,
1463 int nbytes) {
1464 vta::CommandQueue::ThreadLocal()->
1465 PushGEMMOp(uop_handle, finit, signature, nbytes);
1466 return 0;
1467 }
1468
VTAPushALUOp(void ** uop_handle,int (* finit)(void *),void * signature,int nbytes)1469 int VTAPushALUOp(void** uop_handle,
1470 int (*finit)(void*),
1471 void* signature,
1472 int nbytes) {
1473 vta::CommandQueue::ThreadLocal()->
1474 PushALUUop(uop_handle, finit, signature, nbytes);
1475 return 0;
1476 }
1477
VTADepPush(VTACommandHandle cmd,int from_qid,int to_qid)1478 int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid) {
1479 static_cast<vta::CommandQueue*>(cmd)->
1480 DepPush(from_qid, to_qid);
1481 return 0;
1482 }
1483
VTADepPop(VTACommandHandle cmd,int from_qid,int to_qid)1484 int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid) {
1485 static_cast<vta::CommandQueue*>(cmd)->
1486 DepPop(from_qid, to_qid);
1487 return 0;
1488 }
1489
VTASynchronize(VTACommandHandle cmd,uint32_t wait_cycles)1490 void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) {
1491 static_cast<vta::CommandQueue*>(cmd)->
1492 Synchronize(wait_cycles);
1493 }
1494