()) { if (const Broadcast *b = div->b.as()) { if (is_positive_const(b->value)) { Interval ia = bounds_of_lanes(div->a); return {ia.min / b->value, ia.max / b->value}; } else if (is_negative_const(b->value)) { Interval ia = bounds_of_lanes(div->a); return {ia.max / b->value, ia.min / b->value}; } } } else if (const And *and_ = e.as()) { if (const Broadcast *b = and_->b.as()) { Interval ia = bounds_of_lanes(and_->a); return {ia.min && b->value, ia.max && b->value}; } else if (const Broadcast *b = and_->a.as()) { Interval ia = bounds_of_lanes(and_->b); return {ia.min && b->value, ia.max && b->value}; } } else if (const Or *or_ = e.as()) { if (const Broadcast *b = or_->b.as()) { Interval ia = bounds_of_lanes(or_->a); return {ia.min && b->value, ia.max && b->value}; } else if (const Broadcast *b = or_->a.as()) { Interval ia = bounds_of_lanes(or_->b); return {ia.min && b->value, ia.max && b->value}; } } else if (const Min *min = e.as()) { if (const Broadcast *b = min->b.as()) { Interval ia = bounds_of_lanes(min->a); return {Min::make(ia.min, b->value), Min::make(ia.max, b->value)}; } else if (const Broadcast *b = min->a.as()) { Interval ia = bounds_of_lanes(min->b); return {Min::make(ia.min, b->value), Min::make(ia.max, b->value)}; } } else if (const Max *max = e.as()) { if (const Broadcast *b = max->b.as()) { Interval ia = bounds_of_lanes(max->a); return {Max::make(ia.min, b->value), Max::make(ia.max, b->value)}; } else if (const Broadcast *b = max->a.as()) { Interval ia = bounds_of_lanes(max->b); return {Max::make(ia.min, b->value), Max::make(ia.max, b->value)}; } } else if (const Not *not_ = e.as()) { Interval ia = bounds_of_lanes(not_->a); return {!ia.max, !ia.min}; } else if (const Ramp *r = e.as()) { Expr last_lane_idx = make_const(r->base.type(), r->lanes - 1); if (is_positive_const(r->stride)) { return {r->base, r->base + last_lane_idx * r->stride}; } else if (is_negative_const(r->stride)) { return {r->base + last_lane_idx * r->stride, r->base}; } } else if (const Broadcast *b = e.as()) { return {b->value, b->value}; } else if (const Variable *var = e.as()) { return {Variable::make(var->type.element_of(), var->name + ".min_lane"), Variable::make(var->type.element_of(), var->name + ".max_lane")}; } else if (const Let *let = e.as()) { Interval ia = bounds_of_lanes(let->value); Interval ib = bounds_of_lanes(let->body); if (expr_uses_var(ib.min, let->name + ".min_lane")) { ib.min = Let::make(let->name + ".min_lane", ia.min, ib.min); } if (expr_uses_var(ib.max, let->name + ".min_lane")) { ib.max = Let::make(let->name + ".min_lane", ia.min, ib.max); } if (expr_uses_var(ib.min, let->name + ".max_lane")) { ib.min = Let::make(let->name + ".max_lane", ia.max, ib.min); } if (expr_uses_var(ib.max, let->name + ".max_lane")) { ib.max = Let::make(let->name + ".max_lane", ia.max, ib.max); } if (expr_uses_var(ib.min, let->name)) { ib.min = Let::make(let->name, let->value, ib.min); } if (expr_uses_var(ib.max, let->name)) { ib.max = Let::make(let->name, let->value, ib.max); } return ib; } // Take the explicit min and max over the lanes if (e.type().is_bool()) { Expr min_lane = VectorReduce::make(VectorReduce::And, e, 1); Expr max_lane = VectorReduce::make(VectorReduce::Or, e, 1); return {min_lane, max_lane}; } else { Expr min_lane = VectorReduce::make(VectorReduce::Min, e, 1); Expr max_lane = VectorReduce::make(VectorReduce::Max, e, 1); return {min_lane, max_lane}; } }; // A ramp with the lanes repeated (e.g. <0 0 2 2 4 4 6 6>) // TODO(vksnk): With nested vectorization, this will be representable // as a ramp(broadcast(a, repetitions), broadcast(b, repetitions, // lanes) struct InterleavedRamp { Expr base, stride; int lanes, repetitions; }; bool is_interleaved_ramp(const Expr &e, const Scope &scope, InterleavedRamp *result) { if (const Ramp *r = e.as()) { result->base = r->base; result->stride = r->stride; result->lanes = r->lanes; result->repetitions = 1; return true; } else if (const Broadcast *b = e.as()) { result->base = b->value; result->stride = 0; result->lanes = b->lanes; result->repetitions = 0; return true; } else if (const Add *add = e.as()) { InterleavedRamp ra; if (is_interleaved_ramp(add->a, scope, &ra) && is_interleaved_ramp(add->b, scope, result) && (ra.repetitions == 0 || result->repetitions == 0 || ra.repetitions == result->repetitions)) { result->base = simplify(result->base + ra.base); result->stride = simplify(result->stride + ra.stride); if (!result->repetitions) { result->repetitions = ra.repetitions; } return true; } } else if (const Sub *sub = e.as_{()) {
InterleavedRamp ra;
if (is_interleaved_ramp(sub->a, scope, &ra) &&
is_interleaved_ramp(sub->b, scope, result) &&
(ra.repetitions == 0 ||
result->repetitions == 0 ||
ra.repetitions == result->repetitions)) {
result->base = simplify(ra.base - result->base);
result->stride = simplify(ra.stride - result->stride);
if (!result->repetitions) {
result->repetitions = ra.repetitions;
}
return true;
}
} else if (const Mul *mul = e.as()) {
const int64_t *b = nullptr;
if (is_interleaved_ramp(mul->a, scope, result) &&
(b = as_const_int(mul->b))) {
result->base = simplify(result->base * (int)(*b));
result->stride = simplify(result->stride * (int)(*b));
return true;
}
} else if (const Div *div = e.as()) {
const int64_t *b = nullptr;
if (is_interleaved_ramp(div->a, scope, result) &&
(b = as_const_int(div->b)) &&
is_one(result->stride) &&
(result->repetitions == 1 ||
result->repetitions == 0) &&
can_prove((result->base % (int)(*b)) == 0)) {
// TODO: Generalize this. Currently only matches
// ramp(base*b, 1, lanes) / b
// broadcast(base * b, lanes) / b
result->base = simplify(result->base / (int)(*b));
result->repetitions *= (int)(*b);
return true;
}
} else if (const Variable *var = e.as()) {
if (scope.contains(var->name)) {
return is_interleaved_ramp(scope.get(var->name), scope, result);
}
}
return false;
}

// Allocations inside vectorized loops grow an additional inner
// dimension to represent the separate copy of the allocation per
// vector lane. This means loads and stores to them need to be
// rewritten slightly.
class RewriteAccessToVectorAlloc : public IRMutator {
Expr var;
string alloc;
int lanes;

using IRMutator::visit;

Expr mutate_index(const string &a, Expr index) {
index = mutate(index);
if (a == alloc) {
return index * lanes + var;
} else {
return index;
}
}

ModulusRemainder mutate_alignment(const string &a, const ModulusRemainder &align) {
if (a == alloc) {
return align * lanes;
} else {
return align;
}
}

Expr visit(const Load *op) override {
return Load::make(op->type, op->name, mutate_index(op->name, op->index),
op->image, op->param, mutate(op->predicate), mutate_alignment(op->name, op->alignment));
}

Stmt visit(const Store *op) override {
return Store::make(op->name, mutate(op->value), mutate_index(op->name, op->index),
op->param, mutate(op->predicate), mutate_alignment(op->name, op->alignment));
}

public:
RewriteAccessToVectorAlloc(const string &v, string a, int l)
: var(Variable::make(Int(32), v)), alloc(std::move(a)), lanes(l) {
}
};

class UsesGPUVars : public IRVisitor {
private:
using IRVisitor::visit;
void visit(const Variable *op) override {
if (CodeGen_GPU_Dev::is_gpu_var(op->name)) {
debug(3) << "Found gpu loop var: " << op->name << "\n";
uses_gpu = true;
}
}

public:
bool uses_gpu = false;
};

bool uses_gpu_vars(const Expr &s) {
UsesGPUVars uses;
s.accept(&uses);
return uses.uses_gpu;
}

// Wrap a vectorized predicate around a Load/Store node.
class PredicateLoadStore : public IRMutator {
string var;
Expr vector_predicate;
bool in_hexagon;
const Target ⌖
int lanes;
bool valid;
bool vectorized;

using IRMutator::visit;

bool should_predicate_store_load(int bit_size) {
if (in_hexagon) {
internal_assert(target.features_any_of({Target::HVX_64, Target::HVX_128}))
<< "We are inside a hexagon loop, but the target doesn't have hexagon's features\n";
return true;
} else if (target.arch == Target::X86) {
// Should only attempt to predicate store/load if the lane size is
// no less than 4
// TODO: disabling for now due to trunk LLVM breakage.
// See: https://github.com/halide/Halide/issues/3534
// return (bit_size == 32) && (lanes >= 4);
return false;
}
// For other architecture, do not predicate vector load/store
return false;
}

Expr merge_predicate(Expr pred, const Expr &new_pred) {
if (pred.type().lanes() == new_pred.type().lanes()) {
Expr res = simplify(pred && new_pred);
return res;
}
valid = false;
return pred;
}

Expr visit(const Load *op) override {
valid = valid && should_predicate_store_load(op->type.bits());
if (!valid) {
return op;
}

Expr predicate, index;
if (!op->index.type().is_scalar()) {
internal_assert(op->predicate.type().lanes() == lanes);
internal_assert(op->index.type().lanes() == lanes);

predicate = mutate(op->predicate);
index = mutate(op->index);
} else if (expr_uses_var(op->index, var)) {
predicate = mutate(Broadcast::make(op->predicate, lanes));
index = mutate(Broadcast::make(op->index, lanes));
} else {
return IRMutator::visit(op);
}

predicate = merge_predicate(predicate, vector_predicate);
if (!valid) {
return op;
}
vectorized = true;
return Load::make(op->type, op->name, index, op->image, op->param, predicate, op->alignment);
}

Stmt visit(const Store *op) override {
valid = valid && should_predicate_store_load(op->value.type().bits());
if (!valid) {
return op;
}

Expr predicate, value, index;
if (!op->index.type().is_scalar()) {
internal_assert(op->predicate.type().lanes() == lanes);
internal_assert(op->index.type().lanes() == lanes);
internal_assert(op->value.type().lanes() == lanes);

predicate = mutate(op->predicate);
value = mutate(op->value);
index = mutate(op->index);
} else if (expr_uses_var(op->index, var)) {
predicate = mutate(Broadcast::make(op->predicate, lanes));
value = mutate(Broadcast::make(op->value, lanes));
index = mutate(Broadcast::make(op->index, lanes));
} else {
return IRMutator::visit(op);
}

predicate = merge_predicate(predicate, vector_predicate);
if (!valid) {
return op;
}
vectorized = true;
return Store::make(op->name, value, index, op->param, predicate, op->alignment);
}

Expr visit(const Call *op) override {
// We should not vectorize calls with side-effects
valid = valid && op->is_pure();
return IRMutator::visit(op);
}

public:
PredicateLoadStore(string v, const Expr &vpred, bool in_hexagon, const Target &t)
: var(std::move(v)), vector_predicate(vpred), in_hexagon(in_hexagon), target(t),
lanes(vpred.type().lanes()), valid(true), vectorized(false) {
internal_assert(lanes > 1);
}

bool is_vectorized() const {
return valid && vectorized;
}
};

// Substitutes a vector for a scalar var in a Stmt. Used on the
// body of every vectorized loop.
class VectorSubs : public IRMutator {
// The var we're vectorizing
string var;

// What we're replacing it with. Usually a ramp.
Expr replacement;

const Target ⌖

bool in_hexagon; // Are we inside the hexagon loop?

// A suffix to attach to widened variables.
string widening_suffix;

// A scope containing lets and letstmts whose values became
// vectors.
Scope scope;

// The same set of Exprs, indexed by the vectorized var name
Scope vector_scope;

// A stack of all containing lets. We need to reinject the scalar
// version of them if we scalarize inner code.
vector> containing_lets;

// Widen an expression to the given number of lanes.
Expr widen(Expr e, int lanes) {
if (e.type().lanes() == lanes) {
return e;
} else if (e.type().lanes() == 1) {
return Broadcast::make(e, lanes);
} else {
internal_error << "Mismatched vector lanes in VectorSubs\n";
}
return Expr();
}

using IRMutator::visit;

Expr visit(const Cast *op) override {
Expr value = mutate(op->value);
if (value.same_as(op->value)) {
return op;
} else {
Type t = op->type.with_lanes(value.type().lanes());
return Cast::make(t, value);
}
}

Expr visit(const Variable *op) override {
string widened_name = op->name + widening_suffix;
if (op->name == var) {
return replacement;
} else if (scope.contains(op->name)) {
// If the variable appears in scope then we previously widened
// it and we use the new widened name for the variable.
return Variable::make(scope.get(op->name).type(), widened_name);
} else {
return op;
}
}

template
Expr mutate_binary_operator(const T *op) {
Expr a = mutate(op->a), b = mutate(op->b);
if (a.same_as(op->a) && b.same_as(op->b)) {
return op;
} else {
int w = std::max(a.type().lanes(), b.type().lanes());
return T::make(widen(a, w), widen(b, w));
}
}

Expr visit(const Add *op) override {
return mutate_binary_operator(op);
}
Expr visit(const Sub *op) override {
return mutate_binary_operator(op);
}
Expr visit(const Mul *op) override {
return mutate_binary_operator(op);
}
Expr visit(const Div *op) override {
return mutate_binary_operator(op);
}
Expr visit(const Mod *op) override {
return mutate_binary_operator(op);
}
Expr visit(const Min *op) override {
return mutate_binary_operator(op);
}
Expr visit(const Max *op) override {
return mutate_binary_operator(op);
}
Expr visit(const EQ *op) override {
return mutate_binary_operator(op);
}
Expr visit(const NE *op) override {
return mutate_binary_operator(op);
}
Expr visit(const LT *op) override {
return mutate_binary_operator(op);
}
Expr visit(const LE *op) override {
return mutate_binary_operator(op);
}
Expr visit(const GT *op) override {
return mutate_binary_operator(op);
}
Expr visit(const GE *op) override {
return mutate_binary_operator(op);
}
Expr visit(const And *op) override {
return mutate_binary_operator(op);
}
Expr visit(const Or *op) override {
return mutate_binary_operator(op);
}

Expr visit(const Select *op) override {
Expr condition = mutate(op->condition);
Expr true_value = mutate(op->true_value);
Expr false_value = mutate(op->false_value);
if (condition.same_as(op->condition) &&
true_value.same_as(op->true_value) &&
false_value.same_as(op->false_value)) {
return op;
} else {
int lanes = std::max(true_value.type().lanes(), false_value.type().lanes());
lanes = std::max(lanes, condition.type().lanes());
// Widen the true and false values, but we don't have to widen the condition
true_value = widen(true_value, lanes);
false_value = widen(false_value, lanes);
return Select::make(condition, true_value, false_value);
}
}

Expr visit(const Load *op) override {
Expr predicate = mutate(op->predicate);
Expr index = mutate(op->index);

if (predicate.same_as(op->predicate) && index.same_as(op->index)) {
return op;
} else {
int w = index.type().lanes();
predicate = widen(predicate, w);
return Load::make(op->type.with_lanes(w), op->name, index, op->image,
op->param, predicate, op->alignment);
}
}

Expr visit(const Call *op) override {
// Widen the call by changing the lanes of all of its
// arguments and its return type
vector new_args(op->args.size());
bool changed = false;

// Mutate the args
int max_lanes = 0;
for (size_t i = 0; i < op->args.size(); i++) {
Expr old_arg = op->args[i];
Expr new_arg = mutate(old_arg);
if (!new_arg.same_as(old_arg)) changed = true;
new_args[i] = new_arg;
max_lanes = std::max(new_arg.type().lanes(), max_lanes);
}

if (!changed) {
return op;
} else if (op->name == Call::trace) {
const int64_t *event = as_const_int(op->args[6]);
internal_assert(event != nullptr);
if (*event == halide_trace_begin_realization || *event == halide_trace_end_realization) {
// Call::trace vectorizes uniquely for begin/end realization, because the coordinates
// for these are actually min/extent pairs; we need to maintain the proper dimensionality
// count and instead aggregate the widened values into a single pair.
for (size_t i = 1; i <= 2; i++) {
const Call *call = new_args[i].as();
internal_assert(call && call->is_intrinsic(Call::make_struct));
if (i == 1) {
// values should always be empty for these events
internal_assert(call->args.empty());
continue;
}
vector call_args(call->args.size());
for (size_t j = 0; j < call_args.size(); j += 2) {
Expr min_v = widen(call->args[j], max_lanes);
Expr extent_v = widen(call->args[j + 1], max_lanes);
Expr min_scalar = extract_lane(min_v, 0);
Expr max_scalar = min_scalar + extract_lane(extent_v, 0);
for (int k = 1; k < max_lanes; ++k) {
Expr min_k = extract_lane(min_v, k);
Expr extent_k = extract_lane(extent_v, k);
min_scalar = min(min_scalar, min_k);
max_scalar = max(max_scalar, min_k + extent_k);
}
call_args[j] = min_scalar;
call_args[j + 1] = max_scalar - min_scalar;
}
new_args[i] = Call::make(call->type.element_of(), Call::make_struct, call_args, Call::Intrinsic);
}
} else {
// Call::trace vectorizes uniquely, because we want a
// single trace call for the entire vector, instead of
// scalarizing the call and tracing each element.
for (size_t i = 1; i <= 2; i++) {
// Each struct should be a struct-of-vectors, not a
// vector of distinct structs.
const Call *call = new_args[i].as();
internal_assert(call && call->is_intrinsic(Call::make_struct));
// Widen the call args to have the same lanes as the max lanes found
vector call_args(call->args.size());
for (size_t j = 0; j < call_args.size(); j++) {
call_args[j] = widen(call->args[j], max_lanes);
}
new_args[i] = Call::make(call->type.element_of(), Call::make_struct,
call_args, Call::Intrinsic);
}
// One of the arguments to the trace helper
// records the number of vector lanes in the type being
// stored.
new_args[5] = max_lanes;
// One of the arguments to the trace helper
// records the number entries in the coordinates (which we just widened)
if (max_lanes > 1) {
new_args[9] = new_args[9] * max_lanes;
}
}
return Call::make(op->type, Call::trace, new_args, op->call_type);
} else {
// Widen the args to have the same lanes as the max lanes found
for (size_t i = 0; i < new_args.size(); i++) {
new_args[i] = widen(new_args[i], max_lanes);
}
return Call::make(op->type.with_lanes(max_lanes), op->name, new_args,
op->call_type, op->func, op->value_index, op->image, op->param);
}
}

Expr visit(const Let *op) override {

// Vectorize the let value and check to see if it was vectorized by
// this mutator. The type of the expression might already be vector
// width.
Expr mutated_value = mutate(op->value);
bool was_vectorized = (!op->value.type().is_vector() &&
mutated_value.type().is_vector());

// If the value was vectorized by this mutator, add a new name to
// the scope for the vectorized value expression.
string vectorized_name;
if (was_vectorized) {
vectorized_name = op->name + widening_suffix;
scope.push(op->name, mutated_value);
vector_scope.push(vectorized_name, mutated_value);
}

Expr mutated_body = mutate(op->body);

InterleavedRamp ir;
if (is_interleaved_ramp(mutated_value, vector_scope, &ir)) {
return substitute(vectorized_name, mutated_value, mutated_body);
} else if (mutated_value.same_as(op->value) &&
mutated_body.same_as(op->body)) {
return op;
} else if (was_vectorized) {
scope.pop(op->name);
vector_scope.pop(vectorized_name);
return Let::make(vectorized_name, mutated_value, mutated_body);
} else {
return Let::make(op->name, mutated_value, mutated_body);
}
}

Stmt visit(const LetStmt *op) override {
Expr mutated_value = mutate(op->value);
string mutated_name = op->name;

// Check if the value was vectorized by this mutator.
bool was_vectorized = (!op->value.type().is_vector() &&
mutated_value.type().is_vector());

if (was_vectorized) {
mutated_name += widening_suffix;
scope.push(op->name, mutated_value);
vector_scope.push(mutated_name, mutated_value);
// Also keep track of the original let, in case inner code scalarizes.
containing_lets.emplace_back(op->name, op->value);
}

Stmt mutated_body = mutate(op->body);

if (was_vectorized) {
containing_lets.pop_back();
scope.pop(op->name);
vector_scope.pop(mutated_name);

// Inner code might have extracted my lanes using
// extract_lane, which introduces a shuffle_vector. If
// so we should define separate lets for the lanes and
// get it to use those instead.
mutated_body = ReplaceShuffleVectors(mutated_name).mutate(mutated_body);

// Check if inner code wants my individual lanes.
Type t = mutated_value.type();
for (int i = 0; i < t.lanes(); i++) {
string lane_name = mutated_name + ".lane." + std::to_string(i);
if (stmt_uses_var(mutated_body, lane_name)) {
mutated_body =
LetStmt::make(lane_name, extract_lane(mutated_value, i), mutated_body);
}
}

// Inner code may also have wanted my max or min lane
bool uses_min_lane = stmt_uses_var(mutated_body, mutated_name + ".min_lane");
bool uses_max_lane = stmt_uses_var(mutated_body, mutated_name + ".max_lane");

if (uses_min_lane || uses_max_lane) {
Interval i = bounds_of_lanes(mutated_value);

if (uses_min_lane) {
mutated_body =
LetStmt::make(mutated_name + ".min_lane", i.min, mutated_body);
}

if (uses_max_lane) {
mutated_body =
LetStmt::make(mutated_name + ".max_lane", i.max, mutated_body);
}
}
}

InterleavedRamp ir;
if (is_interleaved_ramp(mutated_value, vector_scope, &ir)) {
return substitute(mutated_name, mutated_value, mutated_body);
} else if (mutated_value.same_as(op->value) &&
mutated_body.same_as(op->body)) {
return op;
} else {
return LetStmt::make(mutated_name, mutated_value, mutated_body);
}
}

Stmt visit(const Provide *op) override {
vector new_args(op->args.size());
vector new_values(op->values.size());
bool changed = false;

// Mutate the args
int max_lanes = 0;
for (size_t i = 0; i < op->args.size(); i++) {
Expr old_arg = op->args[i];
Expr new_arg = mutate(old_arg);
if (!new_arg.same_as(old_arg)) changed = true;
new_args[i] = new_arg;
max_lanes = std::max(new_arg.type().lanes(), max_lanes);
}

for (size_t i = 0; i < op->args.size(); i++) {
Expr old_value = op->values[i];
Expr new_value = mutate(old_value);
if (!new_value.same_as(old_value)) changed = true;
new_values[i] = new_value;
max_lanes = std::max(new_value.type().lanes(), max_lanes);
}

if (!changed) {
return op;
} else {
// Widen the args to have the same lanes as the max lanes found
for (size_t i = 0; i < new_args.size(); i++) {
new_args[i] = widen(new_args[i], max_lanes);
}
for (size_t i = 0; i < new_values.size(); i++) {
new_values[i] = widen(new_values[i], max_lanes);
}
return Provide::make(op->name, new_values, new_args);
}
}

Stmt visit(const Store *op) override {
Expr predicate = mutate(op->predicate);
Expr value = mutate(op->value);
Expr index = mutate(op->index);

if (predicate.same_as(op->predicate) && value.same_as(op->value) && index.same_as(op->index)) {
return op;
} else {
int lanes = std::max(predicate.type().lanes(), std::max(value.type().lanes(), index.type().lanes()));
return Store::make(op->name, widen(value, lanes), widen(index, lanes),
op->param, widen(predicate, lanes), op->alignment);
}
}

Stmt visit(const AssertStmt *op) override {
return (op->condition.type().lanes() > 1) ? scalarize(op) : op;
}

Stmt visit(const IfThenElse *op) override {
Expr cond = mutate(op->condition);
int lanes = cond.type().lanes();
debug(3) << "Vectorizing over " << var << "\n"
<< "Old: " << op->condition << "\n"
<< "New: " << cond << "\n";

Stmt then_case = mutate(op->then_case);
Stmt else_case = mutate(op->else_case);

if (lanes > 1) {
// We have an if statement with a vector condition,
// which would mean control flow divergence within the
// SIMD lanes.

bool vectorize_predicate = !uses_gpu_vars(cond);
Stmt predicated_stmt;
if (vectorize_predicate) {
PredicateLoadStore p(var, cond, in_hexagon, target);
predicated_stmt = p.mutate(then_case);
vectorize_predicate = p.is_vectorized();
}
if (vectorize_predicate && else_case.defined()) {
PredicateLoadStore p(var, !cond, in_hexagon, target);
predicated_stmt = Block::make(predicated_stmt, p.mutate(else_case));
vectorize_predicate = p.is_vectorized();
}

debug(4) << "IfThenElse should vectorize predicate over var " << var << "? " << vectorize_predicate << "; cond: " << cond << "\n";
debug(4) << "Predicated stmt:\n"
<< predicated_stmt << "\n";

// First check if the condition is marked as likely.
const Call *c = cond.as();
if (c && (c->is_intrinsic(Call::likely) ||
c->is_intrinsic(Call::likely_if_innermost))) {

// The meaning of the likely intrinsic is that
// Halide should optimize for the case in which
// *every* likely value is true. We can do that by
// generating a scalar condition that checks if
// the least-true lane is true.
Expr all_true = bounds_of_lanes(c->args[0]).min;

// Wrap it in the same flavor of likely
all_true = Call::make(Bool(), c->name,
{all_true}, Call::PureIntrinsic);

if (!vectorize_predicate) {
// We should strip the likelies from the case
// that's going to scalarize, because it's no
// longer likely.
Stmt without_likelies =
IfThenElse::make(op->condition.as()->args[0],
op->then_case, op->else_case);
Stmt stmt =
IfThenElse::make(all_true,
then_case,
scalarize(without_likelies));
debug(4) << "...With all_true likely: \n"
<< stmt << "\n";
return stmt;
} else {
Stmt stmt =
IfThenElse::make(all_true,
then_case,
predicated_stmt);
debug(4) << "...Predicated IfThenElse: \n"
<< stmt << "\n";
return stmt;
}
} else {
// It's some arbitrary vector condition.
if (!vectorize_predicate) {
debug(4) << "...Scalarizing vector predicate: \n"
<< Stmt(op) << "\n";
return scalarize(op);
} else {
Stmt stmt = predicated_stmt;
debug(4) << "...Predicated IfThenElse: \n"
<< stmt << "\n";
return stmt;
}
}
} else {
// It's an if statement on a scalar, we're ok to vectorize the innards.
debug(3) << "Not scalarizing if then else\n";
if (cond.same_as(op->condition) &&
then_case.same_as(op->then_case) &&
else_case.same_as(op->else_case)) {
return op;
} else {
return IfThenElse::make(cond, then_case, else_case);
}
}
}

Stmt visit(const For *op) override {
ForType for_type = op->for_type;
if (for_type == ForType::Vectorized) {
user_warning << "Warning: Encountered vector for loop over " << op->name
<< " inside vector for loop over " << var << "."
<< " Ignoring the vectorize directive for the inner for loop.\n";
for_type = ForType::Serial;
}

Expr min = mutate(op->min);
Expr extent = mutate(op->extent);

Stmt body = op->body;

if (min.type().is_vector()) {
// Rebase the loop to zero and try again
Expr var = Variable::make(Int(32), op->name);
Stmt body = substitute(op->name, var + op->min, op->body);
Stmt transformed = For::make(op->name, 0, op->extent, for_type, op->device_api, body);
return mutate(transformed);
}

if (extent.type().is_vector()) {
// We'll iterate up to the max over the lanes, but
// inject an if statement inside the loop that stops
// each lane from going too far.

extent = bounds_of_lanes(extent).max;
Expr var = Variable::make(Int(32), op->name);
body = IfThenElse::make(likely(var < op->min + op->extent), body);
}

body = mutate(body);

if (min.same_as(op->min) &&
extent.same_as(op->extent) &&
body.same_as(op->body) &&
for_type == op->for_type) {
return op;
} else {
return For::make(op->name, min, extent, for_type, op->device_api, body);
}
}

Stmt visit(const Allocate *op) override {
vector new_extents;
Expr new_expr;

int lanes = replacement.type().lanes();

// The new expanded dimension is innermost.
new_extents.emplace_back(lanes);

for (size_t i = 0; i < op->extents.size(); i++) {
Expr extent = mutate(op->extents[i]);
// For vector sizes, take the max over the lanes. Note
// that we haven't changed the strides, which also may
// vary per lane. This is a bit weird, but the way we
// set up the vectorized memory means that lanes can't
// clobber each others' memory, so it doesn't matter.
if (extent.type().is_vector()) {
extent = bounds_of_lanes(extent).max;
}
new_extents.push_back(extent);
}

if (op->new_expr.defined()) {
new_expr = mutate(op->new_expr);
user_assert(new_expr.type().is_scalar())
<< "Cannot vectorize an allocation with a varying new_expr per vector lane.\n";
}

Stmt body = op->body;

// Rewrite loads and stores to this allocation like so:
// foo[x] -> foo[x*lanes + v]
string v = unique_name('v');
body = RewriteAccessToVectorAlloc(v, op->name, lanes).mutate(body);

scope.push(v, Ramp::make(0, 1, lanes));
body = mutate(body);
scope.pop(v);

// Replace the widened 'v' with the actual ramp
// foo[x*lanes + widened_v] -> foo[x*lanes + ramp(0, 1, lanes)]
body = substitute(v + widening_suffix, Ramp::make(0, 1, lanes), body);

// The variable itself could still exist inside an inner scalarized block.
body = substitute(v, Variable::make(Int(32), var), body);

return Allocate::make(op->name, op->type, op->memory_type, new_extents, op->condition, body, new_expr, op->free_function);
}

Stmt visit(const Atomic *op) override {
// Recognize a few special cases that we can handle as within-vector reduction trees.
do {
if (!op->mutex_name.empty()) {
// We can't vectorize over a mutex
break;
}

// f[x] = f[x] y
const Store *store = op->body.as();
if (!store) break;

VectorReduce::Operator reduce_op = VectorReduce::Add;
Expr a, b;
if (const Add *add = store->value.as()) {
a = add->a;
b = add->b;
reduce_op = VectorReduce::Add;
} else if (const Mul *mul = store->value.as()) {
a = mul->a;
b = mul->b;
reduce_op = VectorReduce::Mul;
} else if (const Min *min = store->value.as()) {
a = min->a;
b = min->b;
reduce_op = VectorReduce::Min;
} else if (const Max *max = store->value.as()) {
a = max->a;
b = max->b;
reduce_op = VectorReduce::Max;
} else if (const Cast *cast_op = store->value.as()) {
if (cast_op->type.element_of() == UInt(8) &&
cast_op->value.type().is_bool()) {
if (const And *and_op = cast_op->value.as()) {
a = and_op->a;
b = and_op->b;
reduce_op = VectorReduce::And;
} else if (const Or *or_op = cast_op->value.as()) {
a = or_op->a;
b = or_op->b;
reduce_op = VectorReduce::Or;
}
}
}

if (!a.defined() || !b.defined()) {
break;
}

// Bools get cast to uint8 for storage. Strip off that
// cast around any load.
if (b.type().is_bool()) {
const Cast *cast_op = b.as();
if (cast_op) {
b = cast_op->value;
}
}
if (a.type().is_bool()) {
const Cast *cast_op = b.as();
if (cast_op) {
a = cast_op->value;
}
}

if (a.as() && !b.as()) {
std::swap(a, b);
}

// We require b to be a var, because it should have been lifted.
const Variable *var_b = b.as();
const Load *load_a = a.as();

if (!var_b ||
!scope.contains(var_b->name) ||
!load_a ||
load_a->name != store->name ||
!is_one(load_a->predicate) ||
!is_one(store->predicate)) {
break;
}

b = scope.get(var_b->name);
Expr store_index = mutate(store->index);
Expr load_index = mutate(load_a->index);

// The load and store indices must be the same interleaved
// ramp (or the same scalar, in the total reduction case).
InterleavedRamp store_ir, load_ir;
Expr test;
if (store_index.type().is_scalar()) {
test = simplify(load_index == store_index);
} else if (is_interleaved_ramp(store_index, vector_scope, &store_ir) &&
is_interleaved_ramp(load_index, vector_scope, &load_ir) &&
store_ir.repetitions == load_ir.repetitions &&
store_ir.lanes == load_ir.lanes) {
test = simplify(store_ir.base == load_ir.base &&
store_ir.stride == load_ir.stride);
}

if (!test.defined()) {
break;
}

if (is_zero(test)) {
break;
} else if (!is_one(test)) {
// TODO: try harder by substituting in more things in scope
break;
}

int output_lanes = 1;
if (store_index.type().is_scalar()) {
// The index doesn't depend on the value being
// vectorized, so it's a total reduction.

b = VectorReduce::make(reduce_op, b, 1);
} else {

output_lanes = store_index.type().lanes() / store_ir.repetitions;

store_index = Ramp::make(store_ir.base, store_ir.stride, output_lanes);
b = VectorReduce::make(reduce_op, b, output_lanes);
}

Expr new_load = Load::make(load_a->type.with_lanes(output_lanes),
load_a->name, store_index, load_a->image,
load_a->param, const_true(output_lanes),
ModulusRemainder{});

switch (reduce_op) {
case VectorReduce::Add:
b = new_load + b;
break;
case VectorReduce::Mul:
b = new_load * b;
break;
case VectorReduce::Min:
b = min(new_load, b);
break;
case VectorReduce::Max:
b = max(new_load, b);
break;
case VectorReduce::And:
b = cast(new_load.type(), cast(b.type(), new_load) && b);
break;
case VectorReduce::Or:
b = cast(new_load.type(), cast(b.type(), new_load) || b);
break;
}

Stmt s = Store::make(store->name, b, store_index, store->param,
const_true(b.type().lanes()), store->alignment);

// We may still need the atomic node, if there was more
// parallelism than just the vectorization.
s = Atomic::make(op->producer_name, op->mutex_name, s);

return s;
} while (0);

// In the general case, if a whole stmt has to be done
// atomically, we need to serialize.
return scalarize(op);
}

Stmt scalarize(Stmt s) {
// Wrap a serial loop around it. Maybe LLVM will have
// better luck vectorizing it.

// We'll need the original scalar versions of any containing lets.
for (size_t i = containing_lets.size(); i > 0; i--) {
const auto &l = containing_lets[i - 1];
s = LetStmt::make(l.first, l.second, s);
}

const Ramp *r = replacement.as();
internal_assert(r) << "Expected replacement in VectorSubs to be a ramp\n";
return For::make(var, r->base, r->lanes, ForType::Serial, DeviceAPI::None, s);
}

Expr scalarize(Expr e) {
// This method returns a select tree that produces a vector lanes
// result expression

Expr result;
int lanes = replacement.type().lanes();

for (int i = lanes - 1; i >= 0; --i) {
// Hide all the vector let values in scope with a scalar version
// in the appropriate lane.
for (Scope::const_iterator iter = scope.cbegin(); iter != scope.cend(); ++iter) {
string name = iter.name() + ".lane." + std::to_string(i);
Expr lane = extract_lane(iter.value(), i);
e = substitute(iter.name(), Variable::make(lane.type(), name), e);
}

// Replace uses of the vectorized variable with the extracted
// lane expression
e = substitute(var, i, e);

if (i == lanes - 1) {
result = Broadcast::make(e, lanes);
} else {
Expr cond = (replacement == Broadcast::make(i, lanes));
result = Select::make(cond, Broadcast::make(e, lanes), result);
}
}

return result;
}

public:
VectorSubs(string v, Expr r, bool in_hexagon, const Target &t)
: var(std::move(v)), replacement(std::move(r)), target(t), in_hexagon(in_hexagon) {
widening_suffix = ".x" + std::to_string(replacement.type().lanes());
}
}; // namespace

class FindVectorizableExprsInAtomicNode : public IRMutator {
// An Atomic node protects all accesses to a given buffer. We
// consider a name "poisoned" if it depends on an access to this
// buffer. We can't lift or vectorize anything that has been
// poisoned.
Scope<> poisoned_names;
bool poison = false;

using IRMutator::visit;

template
const T *visit_let(const T *op) {
mutate(op->value);
ScopedBinding<> bind_if(poison, poisoned_names, op->name);
mutate(op->body);
return op;
}

Stmt visit(const LetStmt *op) override {
return visit_let(op);
}

Expr visit(const Let *op) override {
return visit_let(op);
}

Expr visit(const Load *op) override {
// Even if the load is bad, maybe we can lift the index
IRMutator::visit(op);

poison |= poisoned_names.contains(op->name);
return op;
}

Expr visit(const Variable *op) override {
poison = poisoned_names.contains(op->name);
return op;
}

Stmt visit(const Store *op) override {
// A store poisons all subsequent loads, but loads before the
// first store can be lifted.
mutate(op->index);
mutate(op->value);
poisoned_names.push(op->name);
return op;
}

Expr visit(const Call *op) override {
IRMutator::visit(op);
poison |= !op->is_pure();
return op;
}

public:
using IRMutator::mutate;

Expr mutate(const Expr &e) override {
bool old_poison = poison;
poison = false;
IRMutator::mutate(e);
if (!poison) {
liftable.insert(e);
}
poison |= old_poison;
// We're not actually mutating anything. This class is only a
// mutator so that we can override a generic mutate() method.
return e;
}

FindVectorizableExprsInAtomicNode(const string &buf, const map &env) {
poisoned_names.push(buf);
auto it = env.find(buf);
if (it != env.end()) {
// Handle tuples
size_t n = it->second.values().size();
if (n > 1) {
for (size_t i = 0; i < n; i++) {
poisoned_names.push(buf + "." + std::to_string(i));
}
}
}
}

std::set liftable;
};

class LiftVectorizableExprsOutOfSingleAtomicNode : public IRMutator {
const std::set &liftable;

using IRMutator::visit;

template
StmtOrExpr visit_let(const LetStmtOrLet *op) {
if (liftable.count(op->value)) {
// Lift it under its current name to avoid having to
// rewrite the variables in other lifted exprs.
// TODO: duplicate non-overlapping liftable let stmts due to unrolling.
lifted.emplace_back(op->name, op->value);
return mutate(op->body);
} else {
return IRMutator::visit(op);
}
}

Stmt visit(const LetStmt *op) override {
return visit_let(op);
}

Expr visit(const Let *op) override {
return visit_let(op);
}

public:
map already_lifted;
vector> lifted;

using IRMutator::mutate;

Expr mutate(const Expr &e) override {
if (liftable.count(e) && !is_const(e) && !e.as()) {
auto it = already_lifted.find(e);
string name;
if (it != already_lifted.end()) {
name = it->second;
} else {
name = unique_name('t');
lifted.emplace_back(name, e);
already_lifted.emplace(e, name);
}
return Variable::make(e.type(), name);
} else {
return IRMutator::mutate(e);
}
}

LiftVectorizableExprsOutOfSingleAtomicNode(const std::set &liftable)
: liftable(liftable) {
}
};

class LiftVectorizableExprsOutOfAllAtomicNodes : public IRMutator {
using IRMutator::visit;

Stmt visit(const Atomic *op) override {
FindVectorizableExprsInAtomicNode finder(op->producer_name, env);
finder.mutate(op->body);
LiftVectorizableExprsOutOfSingleAtomicNode lifter(finder.liftable);
Stmt new_body = lifter.mutate(op->body);
new_body = Atomic::make(op->producer_name, op->mutex_name, new_body);
while (!lifter.lifted.empty()) {
auto p = lifter.lifted.back();
new_body = LetStmt::make(p.first, p.second, new_body);
lifter.lifted.pop_back();
}
return new_body;
}

const map &env;

public:
LiftVectorizableExprsOutOfAllAtomicNodes(const map &env)
: env(env) {
}
};

// Vectorize all loops marked as such in a Stmt
class VectorizeLoops : public IRMutator {
const Target ⌖
bool in_hexagon;

using IRMutator::visit;

Stmt visit(const For *for_loop) override {
bool old_in_hexagon = in_hexagon;
if (for_loop->device_api == DeviceAPI::Hexagon) {
in_hexagon = true;
}

Stmt stmt;
if (for_loop->for_type == ForType::Vectorized) {
const IntImm *extent = for_loop->extent.as();
if (!extent || extent->value <= 1) {
user_error << "Loop over " << for_loop->name
<< " has extent " << for_loop->extent
<< ". Can only vectorize loops over a "
<< "constant extent > 1\n";
}

// Replace the var with a ramp within the body
Expr for_var = Variable::make(Int(32), for_loop->name);
Expr replacement = Ramp::make(for_loop->min, 1, extent->value);
stmt = VectorSubs(for_loop->name, replacement, in_hexagon, target).mutate(for_loop->body);
} else {
stmt = IRMutator::visit(for_loop);
}

if (for_loop->device_api == DeviceAPI::Hexagon) {
in_hexagon = old_in_hexagon;
}

return stmt;
}

public:
VectorizeLoops(const Target &t)
: target(t), in_hexagon(false) {
}
};

/** Check if all stores in a Stmt are to names in a given scope. Used
by RemoveUnnecessaryAtomics below. */
class AllStoresInScope : public IRVisitor {
using IRVisitor::visit;
void visit(const Store *op) override {
result = result && s.contains(op->name);
}

public:
bool result = true;
const Scope<> &s;
AllStoresInScope(const Scope<> &s)
: s(s) {
}
};
bool all_stores_in_scope(const Stmt &stmt, const Scope<> &scope) {
AllStoresInScope checker(scope);
stmt.accept(&checker);
return checker.result;
}

/** Drop any atomic nodes protecting buffers that are only accessed
* from a single thread. */
class RemoveUnnecessaryAtomics : public IRMutator {
using IRMutator::visit;

// Allocations made from within this same thread
bool in_thread = false;
Scope<> local_allocs;

Stmt visit(const Allocate *op) override {
ScopedBinding<> bind(local_allocs, op->name);
return IRMutator::visit(op);
}

Stmt visit(const Atomic *op) override {
if (!in_thread || all_stores_in_scope(op->body, local_allocs)) {
return mutate(op->body);
} else {
return op;
}
}

Stmt visit(const For *op) override {
if (is_parallel(op->for_type)) {
ScopedValue old_in_thread(in_thread, true);
Scope<> old_local_allocs;
old_local_allocs.swap(local_allocs);
Stmt s = IRMutator::visit(op);
old_local_allocs.swap(local_allocs);
return s;
} else {
return IRMutator::visit(op);
}
}
};

} // namespace

Stmt vectorize_loops(const Stmt &stmt, const map &env, const Target &t) {
// Limit the scope of atomic nodes to just the necessary stuff.
// TODO: Should this be an earlier pass? It's probably a good idea
// for non-vectorizing stuff too.
Stmt s = LiftVectorizableExprsOutOfAllAtomicNodes(env).mutate(stmt);
s = VectorizeLoops(t).mutate(s);
s = RemoveUnnecessaryAtomics().mutate(s);
return s;
}

} // namespace Internal
} // namespace Halide}