/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \brief Hybrid computation rule.
 * \file hybrid_op.cc
 */
#include "hybrid_op.h"

#include <tvm/arith/analyzer.h>
#include <tvm/runtime/registry.h>
#include <tvm/te/operation.h>
#include <tvm/tir/analysis.h>
#include <tvm/tir/expr.h>
#include <tvm/tir/op.h>
#include <tvm/tir/stmt_functor.h>

#include <string>
#include <unordered_set>
#include <utility>

#include "op_util.h"

namespace tvm {
namespace te {
using namespace tir;
// HybridOpNode
TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
    .set_dispatch<HybridOpNode>([](const ObjectRef& node, ReprPrinter* p) {
      auto* op = static_cast<const HybridOpNode*>(node.get());
      p->stream << "hybrid(" << op->name << ", " << op << ")";
    });

TVM_REGISTER_NODE_TYPE(HybridOpNode);

int HybridOpNode::num_outputs() const { return static_cast<int>(outputs.size()); }

Array<IterVar> HybridOpNode::root_iter_vars() const { return this->axis; }

DataType HybridOpNode::output_dtype(size_t i) const { return outputs[i]->dtype; }

Array<PrimExpr> HybridOpNode::output_shape(size_t i) const { return outputs[i]->shape; }

HybridOp::HybridOp(std::string name, std::string tag, Map<String, ObjectRef> attrs,
                   Array<Tensor> inputs, Array<Tensor> outputs, Stmt body) {
  if (!attrs.defined()) {
    attrs = Map<String, ObjectRef>();
  }
  auto n = make_object<HybridOpNode>();
  n->name = std::move(name);
  n->tag = std::move(tag);
  n->attrs = std::move(attrs);
  n->inputs = std::move(inputs);
  n->outputs = std::move(outputs);
  n->axis = te::GatherLoopVars(body);
  n->body = std::move(body);
  data_ = std::move(n);
}

TVM_REGISTER_GLOBAL("te.HybridOp")
    .set_body_typed([](std::string name, std::string tag, Map<String, ObjectRef> attrs,
                       Array<Tensor> inputs, Array<Tensor> outputs,
                       Stmt body) { return HybridOp(name, tag, attrs, inputs, outputs, body); });

Array<Tensor> HybridOpNode::InputTensors() const {
  // Because input tensors may be inlined into hybrid scripts,
  // we need to check which of the input tensors are actually used in the body.
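  // For example, if a hybrid script is handed tensors (a, b) but its body
  // only ever reads a[...], only `a` is reported here even though both
  // tensors were passed in as inputs.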
  std::unordered_set<Tensor> orig_inputs;
  for (auto t : inputs) {
    orig_inputs.insert(t);
  }
  std::unordered_set<Tensor> visited;
  Array<Tensor> curr_inputs;
  tir::PostOrderVisit(body, [&curr_inputs, &orig_inputs, &visited](const ObjectRef& n) {
    if (auto* pload = n.as<tir::ProducerLoadNode>()) {
      Tensor t = Downcast<Tensor>(pload->producer);
      if (orig_inputs.count(t) && !visited.count(t)) {
        curr_inputs.push_back(t);
        visited.insert(t);
      }
    }
  });
  return curr_inputs;
}

Operation HybridOpNode::ReplaceInputs(const Operation& self,
                                      const std::unordered_map<Tensor, Tensor>& rmap) const {
  CHECK_EQ(self.operator->(), this);
  auto n = make_object<HybridOpNode>(*this);
  n->body = te::ReplaceTensor(this->body, rmap);
  for (size_t i = 0; i < n->inputs.size(); ++i) {
    Tensor t = n->inputs[i];
    if (rmap.count(t)) {
      n->inputs.Set(i, rmap.at(t));
    }
  }

  if (body.same_as(n->body) && inputs.same_as(n->inputs)) {
    return self;
  } else {
    return Operation(n);
  }
}

void HybridOpNode::PropBoundToInputs(const Operation& self, arith::Analyzer* analyzer,
                                     const std::unordered_map<const VarNode*, IntSet>& dom_map,
                                     std::unordered_map<Tensor, TensorDom>* out_dom_map) const {
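  // The hybrid body is opaque to bound inference, so conservatively demand
  // the full extent [0, shape[i]) of every dimension of each used input.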
  auto curr_inputs = InputTensors();
  for (Tensor t : curr_inputs) {
    auto it = out_dom_map->find(t);
    if (it == out_dom_map->end()) continue;
    TensorDom& dom = it->second;
    for (size_t i = 0; i < t->shape.size(); ++i) {
      dom.data[i].emplace_back(
          IntSet::FromRange(Range::FromMinExtent(make_const(t->shape[i].dtype(), 0), t->shape[i])));
    }
  }
}

void HybridOpNode::GatherBound(const Operation& self,
                               const std::unordered_map<Tensor, TensorDom>& tensor_dom,
                               std::unordered_map<IterVar, Range>* out_dom_map) const {
  for (auto iter_var : axis) {
    CHECK(!out_dom_map->count(iter_var));
    out_dom_map->operator[](iter_var) = iter_var->dom;
  }
}

Stmt HybridOpNode::BuildRealize(const Stage& stage,
                                const std::unordered_map<IterVar, Range>& realize_map,
                                const Stmt& body) const {
  // TODO(@were): Add attribute inject here and remove it from hybrid parser.
  CHECK_EQ(stage->op.get(), this);
  Stmt realize_body = body;
  for (int k = 0; k < num_outputs(); ++k) {
    Tensor t = stage->op.output(k);
    Region bounds;
    for (size_t i = 0; i < t->shape.size(); ++i) {
      bounds.push_back(Range::FromMinExtent(make_const(t->shape[i].dtype(), 0), t->shape[i]));
    }
    realize_body = tir::ProducerRealize(t, bounds, const_true(), realize_body);
  }
  return realize_body;
}

Stmt HybridOpNode::BuildProvide(const Stage& stage,
                                const std::unordered_map<IterVar, Range>& dom_map,
                                bool debug_keep_trivial_loop) const {
  CHECK_EQ(stage->op.operator->(), this);
  Stmt ret = AttrStmt(make_zero(DataType::Int(32)), tir::attr::extern_scope, 0, this->body);
  std::unordered_map<Tensor, Tensor> rmap;
  for (int i = 0; i < this->num_outputs(); ++i) {
    rmap[outputs[i]] = stage->op.output(i);
  }
  auto n = make_object<HybridOpNode>(*this);
  /* This story is a little bit complicated.
   * The following two lines of code replace the usage of the output tensors.
   * This is the simplest way I (@were) could come up with to glue the
   * hybrid operation node into the TVM op system.
   * In a hybrid script all the tensors, especially the output tensors,
   * have their own names defined by the user. However, in TVM's
   * conventional ops:
   *   1. Output tensors refer to the corresponding op node, so the output
   *      tensors carry the name of the operation that produces them.
   *   2. Once an OpNode is wrapped up by an Operation node, it is finalized.
   *      Later access will be through a const OpNode*.
   * This is a chicken-and-egg paradox: it is impossible to put the output
   * tensors into the function body without forming the op node, yet the
   * function body is immutable after the node is formed.
   *
   * Finally, I decided to resolve this issue "lazily". This stage is very
   * early in the compilation pipeline, technically before Phase 0, so the
   * actual tensors are replaced here.
   * Thus, the operation body is slightly different from the Phase 0 body.
   * This is a major difference between HybridOpNode and ExternOpNode.
   */
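  /* An illustrative sketch: for an output the user named `c` in the script,
   *   rmap = {c: stage->op.output(0)}
   * rewrites both reads (ProducerLoad) and writes (ProducerStore) of `c` in
   * the body to the canonical output tensor of this operation. */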
  ret = te::ReplaceTensor(ret, rmap);
  ret = te::ReplaceProvideTensor(ret, rmap);

  ret = te::ApplySchedule(stage, dom_map, ret);
  return ret;
}

Stmt ApplyLoopShapes(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
                     Stmt stmt) {
  class LoopSpliter : public StmtExprMutator {
    PrimExpr factor;
    const VarNode* parent;
    IterVar inner, outer;

   public:
    bool splitted;
    LoopSpliter(const SplitNode* split, const std::unordered_map<IterVar, Range>& dom_map)
        : factor(split->factor), splitted(false) {
      parent = split->parent->var.get();

      auto& inner_ = split->inner;
      CHECK(dom_map.count(inner_));
      auto& inner_dom = dom_map.find(inner_)->second;
      CHECK(is_const_int(inner_dom->min, 0));

      auto& outer_ = split->outer;
      CHECK(dom_map.count(outer_));
      auto& outer_dom = dom_map.find(outer_)->second;
      CHECK(is_const_int(outer_dom->min, 0));

      inner = IterVar(inner_dom, inner_->var, inner_->iter_type);
      outer = IterVar(outer_dom, outer_->var, outer_->iter_type);
    }

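    // Roughly, splitting `for (i, 0, n)` by `factor` rewrites it to
    //   for (i.outer, 0, ceil(n / factor))
    //     for (i.inner, 0, factor)
    //       if (likely(i.outer * factor + i.inner < n))
    //         body[i := i.inner + i.outer * factor]
    // where the likely() guard keeps the tail iterations in bounds.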
    Stmt VisitStmt_(const ForNode* op) final {
      if (op->loop_var.get() == parent) {
        std::unordered_map<const VarNode*, PrimExpr> rmap;
        rmap[op->loop_var.get()] = inner + outer * factor;
        Stmt ret = tir::Substitute(op->body, rmap);
        PrimExpr cond = likely(outer * factor < (op->extent - inner));
        ret = IfThenElse(cond, ret);
        ret = For(inner->var, PrimExpr(0), inner->dom->extent,
                  IterVarTypeToForType(inner->iter_type), op->device_api, ret);
        ret = For(outer->var, PrimExpr(0), outer->dom->extent,
                  IterVarTypeToForType(outer->iter_type), op->device_api, ret);
        splitted = true;
        return ret;
      }
      return StmtExprMutator::VisitStmt_(op);
    }
  };

  class LoopFuser : public StmtExprMutator {
    const IterVar& parent;
    const VarNode* inner;
    const VarNode* outer;
    bool under_outer;
    PrimExpr extent;

   public:
    bool fused;
    explicit LoopFuser(const FuseNode* fuse_)
        : parent(fuse_->fused),
          inner(fuse_->inner->var.get()),
          outer(fuse_->outer->var.get()),
          under_outer(false),
          extent(0),
          fused(false) {}

    // TODO(@were): Handle imperfect loops
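    // Roughly, fusing `for (i, 0, m) { for (j, 0, n) body }` rewrites it to
    //   for (i.j.fused, 0, m * n)
    //     body[i := i.j.fused / n, j := i.j.fused % n]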
    Stmt VisitStmt_(const ForNode* op) final {
      if (op->loop_var.get() == inner) {
        CHECK(under_outer);
        std::unordered_map<const VarNode*, PrimExpr> rmap;
        rmap[op->loop_var.get()] = indexmod(parent, op->extent);
        extent = op->extent;
        fused = true;
        return tir::Substitute(op->body, rmap);
      } else if (op->loop_var.get() == outer) {
        under_outer = true;
        Stmt body = this->VisitStmt(op->body);
        std::unordered_map<const VarNode*, PrimExpr> rmap;
        rmap[op->loop_var.get()] = indexdiv(parent, extent);
        body = tir::Substitute(body, rmap);
        under_outer = false;
        return For(parent->var, PrimExpr(0), extent * op->extent, op->for_type, op->device_api,
                   body);
      } else if (under_outer) {
        Stmt body = this->VisitStmt(op->body);
        std::unordered_map<const VarNode*, PrimExpr> rmap;
        rmap[op->loop_var.get()] = indexmod(indexdiv(parent, extent), op->extent);
        body = tir::Substitute(body, rmap);
        extent = extent * op->extent;
        return body;
      }
      return StmtExprMutator::VisitStmt_(op);
    }
  };

  for (auto& rel : stage->relations) {
    if (const SplitNode* split = rel.as<SplitNode>()) {
      LoopSpliter Spliter(split, dom_map);
      stmt = Spliter(stmt);
      CHECK(Spliter.splitted);
    } else if (const FuseNode* fuse = rel.as<FuseNode>()) {
      LoopFuser Fuser(fuse);
      stmt = Fuser(stmt);
      CHECK(Fuser.fused);
    }
  }

  return stmt;
}

Stmt ApplyLoopAnnotations(const Stage& stage, const std::unordered_map<IterVar, IterVar>& rebased,
                          Stmt stmt) {
  class LoopAnnotator : public StmtMutator {
    const VarNode* var;
    const IterVarAttr& attr;

   public:
    LoopAnnotator(const VarNode* var_, const IterVarAttr& attr_) : var(var_), attr(attr_) {}

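    // Roughly, binding a loop to a thread IterVar such as threadIdx.x turns
    //   for (i, 0, n) body
    // into
    //   AttrStmt(threadIdx.x, "thread_extent", n, body[i := threadIdx.x])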
    Stmt VisitStmt_(const ForNode* op) final {
      tir::ExprDeepEqual expr_equal;

      if (op->loop_var.get() == var) {
        if (attr->bind_thread.defined()) {
          const auto& iter_var = attr->bind_thread;
          if (iter_var->dom.defined()) {
            CHECK(is_const_int(iter_var->dom->min, 0));
            CHECK(expr_equal(iter_var->dom->extent, op->extent))
                << "Thread extent and loop extent mismatch!\n";
          }
          std::unordered_map<const VarNode*, PrimExpr> rmap;
          rmap[op->loop_var.get()] = iter_var;
          Stmt body = tir::Substitute(op->body, rmap);
          return AttrStmt(iter_var, "thread_extent", op->extent, body);
        } else {
          return For(op->loop_var, op->min, op->extent, IterVarTypeToForType(attr->iter_type),
                     op->device_api, op->body);
        }
      }
      return StmtMutator::VisitStmt_(op);
    }
  };

  for (auto& iter_var : stage->leaf_iter_vars) {
    bool need_change = false;
    int found = 0;

    const IterVar& actual = rebased.count(iter_var) ? rebased.find(iter_var)->second : iter_var;
    const VarNode* var = actual->var.get();
    ForType expected = IterVarTypeToForType(iter_var->iter_type);
    IterVarAttr attr;
    if (stage->iter_var_attrs.count(iter_var)) {
      attr = stage->iter_var_attrs[iter_var];
      expected = IterVarTypeToForType(attr->iter_type);
    }

    PostOrderVisit(stmt, [&found, &var, &attr, &expected, &need_change](const ObjectRef& node) {
      if (const ForNode* op = node.as<ForNode>()) {
        if (op->loop_var.get() == var) {
          ++found;
          need_change = expected != op->for_type || (attr.defined() && attr->bind_thread.defined());
        }
      }
    });

    CHECK_EQ(found, 1) << "Iter var should be found exactly once!";
    if (need_change) {
      stmt = LoopAnnotator(var, attr)(std::move(stmt));
    }
  }
  return stmt;
}

Stmt ApplyLoopOrder(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
                    const std::unordered_map<IterVar, IterVar>& rebased, Stmt stmt) {
  std::vector<const VarNode*> current_order;
  PostOrderVisit(stmt, [&current_order](const ObjectRef& node) {
    if (const ForNode* op = node.as<ForNode>()) current_order.push_back(op->loop_var.get());
  });
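  // PostOrderVisit reaches the innermost loop first, so reverse the list to
  // get the outermost-to-innermost order that leaf_iter_vars uses.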
  std::reverse(current_order.begin(), current_order.end());
  auto& required_ord = stage->leaf_iter_vars;
  CHECK_EQ(current_order.size(), required_ord.size()) << "Cannot reorder the loops!";
  std::unordered_map<const VarNode*, IterVar> reorder;
  bool need_reorder = false;
  for (size_t i = 0; i < current_order.size(); ++i) {
    auto& current = current_order[i];
    const IterVar& iter_var = required_ord[i];
    const IterVar& required = rebased.count(iter_var) ? rebased.find(iter_var)->second : iter_var;
    CHECK(required->dom.defined() || dom_map.count(required)) << required << "\n";
    reorder[current] = required;
    if (current != required->var.get()) {
      need_reorder = true;
    }
  }

  class LoopReorder : public StmtMutator {
    const Stage& stage;
    const std::unordered_map<IterVar, Range>& dom_map;
    const std::unordered_map<const VarNode*, IterVar>& reorder;

   public:
    LoopReorder(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
                const std::unordered_map<const VarNode*, IterVar>& reorder)
        : stage(stage), dom_map(dom_map), reorder(reorder) {}

    Stmt VisitStmt_(const ForNode* op) final {
      // Reorder from inner to outer
      Stmt body_ = this->VisitStmt(op->body);
      CHECK(reorder.count(op->loop_var.get()));
      auto target = reorder.find(op->loop_var.get())->second;
      if (body_.same_as(op->body) && op->loop_var.get() == target->var.get())
        return GetRef<Stmt>(op);
      const Stmt& body = op->body.same_as(body_) ? op->body : body_;
      ForType for_type = IterVarTypeToForType(target->iter_type);
      if (stage->iter_var_attrs.count(target)) {
        for_type = IterVarTypeToForType(stage->iter_var_attrs[target]->iter_type);
      }
      const Range& range = target->dom.defined() ? target->dom : dom_map.find(target)->second;
      return For(target->var, range->min, range->extent, for_type, DeviceAPI::None, body);
    }
  };

  if (need_reorder) return LoopReorder(stage, dom_map, reorder)(stmt);

  return stmt;
}

Stmt ApplySchedule(const Stage& stage, const std::unordered_map<IterVar, Range>& dom_map,
                   Stmt stmt) {
  // TODO(@were): Eliminate loop rebase in script parser and move the burden here
  // Gather rebased variables
  std::unordered_map<IterVar, IterVar> rebased;
  for (auto rel : stage->relations) {
    if (const auto* rebase = rel.as<RebaseNode>()) {
      rebased[rebase->rebased] = rebase->parent;
      CHECK(rebase->parent->dom.defined());
      CHECK(dom_map.count(rebase->rebased));
    }
  }
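  // Loop shapes (split/fuse) must be materialized before reordering and
  // annotating, since the later passes look up the loop variables that
  // splitting and fusing introduce.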
  stmt = ApplyLoopShapes(stage, dom_map, stmt);
  stmt = ApplyLoopOrder(stage, dom_map, rebased, stmt);
  stmt = ApplyLoopAnnotations(stage, rebased, stmt);
  return stmt;
}

std::vector<IterVar> GatherLoopVars(Stmt stmt) {
  // TODO(@were): Write a comprehensive pass to analyze iter var types
  std::vector<IterVar> res_;
  PostOrderVisit(stmt, [&res_](const ObjectRef& node) {
    if (const ForNode* op = node.as<ForNode>()) {
      Var loop_var(op->loop_var);
      Range dom = Range::FromMinExtent(op->min, op->extent);
      res_.push_back(IterVar(dom, loop_var, ForTypeToIterVarType(op->for_type)));
    }
  });
  std::reverse(res_.begin(), res_.end());
  return res_;
}

// Replacer that rewrites which tensor a ProducerStore (Provide) writes to.
class ProviderReplacer : public tir::StmtMutator {
 public:
  explicit ProviderReplacer(const std::unordered_map<Tensor, Tensor>& vmap) : vmap_(vmap) {}

  Stmt VisitStmt_(const tir::ProducerStoreNode* op) final {
    Tensor t = Downcast<Tensor>(op->producer);
    auto it = vmap_.find(t);
    if (it != vmap_.end()) {
      Stmt ret = tir::ProducerStore(it->second, op->value, op->indices);
      found = true;
      return this->VisitStmt(ret);
    }
    return StmtMutator::VisitStmt_(op);
  }

  // Whether any replacement was found.
  bool found{false};

 private:
  const std::unordered_map<Tensor, Tensor>& vmap_;
};

Stmt ReplaceProvideTensor(Stmt stmt, const std::unordered_map<Tensor, Tensor>& replace) {
  ProviderReplacer repl(replace);
  Stmt ret = repl(stmt);
  return repl.found ? ret : stmt;
}
}  // namespace te
}  // namespace tvm