1 /*
2  *  yosys -- Yosys Open SYnthesis Suite
3  *
4  *  Copyright (C) 2018  whitequark <whitequark@whitequark.org>
5  *
6  *  Permission to use, copy, modify, and/or distribute this software for any
7  *  purpose with or without fee is hereby granted, provided that the above
8  *  copyright notice and this permission notice appear in all copies.
9  *
10  *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  *
18  */
19 
20 // [[CITE]] FlowMap algorithm
21 // Jason Cong; Yuzheng Ding, "An Optimal Technology Mapping Algorithm for Delay Optimization in Lookup-Table Based FPGA Designs,"
22 // Computer-Aided Design of Integrated Circuits and Systems, IEEE Transactions on, Vol. 13, pp. 1-12, Jan. 1994.
23 // doi: 10.1109/43.273754
24 
25 // [[CITE]] FlowMap-r algorithm
26 // Jason Cong; Yuzheng Ding, "On Area/Depth Tradeoff in LUT-Based FPGA Technology Mapping,"
27 // Very Large Scale Integration Systems, IEEE Transactions on, Vol. 2, June 1994.
28 // doi: 10.1109/92.28574
29 
30 // Required reading material:
31 //
32 // Min-cut max-flow theorem:
33 //   https://www.coursera.org/lecture/algorithms-part2/maxflow-mincut-theorem-beb9G
34 // FlowMap paper:
35 //   http://cadlab.cs.ucla.edu/~cong/papers/iccad92.pdf   (short version)
36 //   https://limsk.ece.gatech.edu/book/papers/flowmap.pdf (long version)
37 // FlowMap-r paper:
38 //   http://cadlab.cs.ucla.edu/~cong/papers/dac93.pdf     (short version)
39 //   https://sci-hub.tw/10.1109/92.285741                 (long version)
40 
41 // Notes on correspondence between paper and implementation:
42 //
43 // 1. In the FlowMap paper, the nodes are logic elements (analogous to Yosys cells) and edges are wires. However, in our implementation,
44 // we use an inverted approach: the nodes are Yosys wire bits, and the edges are derived from (but aren't represented by) Yosys cells.
45 // This may seem counterintuitive. Three observations may help understanding this. First, for a cell with a 1-bit Y output that is
46 // the sole driver of its output net (which is the typical case), these representations are equivalent, because there is an exact
47 // correspondence between cells and output wires. Second, in the paper, primary inputs (analogous to Yosys cell or module ports) are
48 // nodes, and in Yosys, inputs are wires; our approach allows a direct mapping from both primary inputs and 1-output logic elements to
49 // flow graph nodes. Third, Yosys cells may have multiple outputs or multi-bit outputs, and by using Yosys wire bits as flow graph nodes,
50 // such cells are supported without any additional effort; any Yosys cell with n output wire bits ends up being split into n flow graph
51 // nodes.
52 //
53 // 2. The FlowMap paper introduces three networks: Nt, Nt', and Nt''. The network Nt is directly represented by a subgraph of RTLIL graph,
54 // which is parsed into an equivalent but easier to traverse representation in FlowmapWorker. The network Nt' is built explicitly
55 // from a subgraph of Nt, and uses a similar representation in FlowGraph. The network Nt'' is implicit in FlowGraph, which is possible
56 // because of the following observation: each Nt' node corresponds to an Nt'' edge of capacity 1, and each Nt' edge corresponds to
57 // an Nt'' edge of capacity ∞. Therefore, we only need to explicitly record flow for Nt' edges and through Nt' nodes.
58 //
59 // 3. The FlowMap paper ambiguously states: "Moreover, we can find such a cut (X′′, X̅′′) by performing a depth first search starting at
60 // the source s, and including in X′′ all the nodes which are reachable from s." This actually refers to a specific kind of search,
61 // min-cut computation. Min-cut computation involves computing the set of nodes reachable from s by an undirected path with no full
62 // (i.e. zero capacity) forward edges or empty (i.e. no flow) backward edges. In addition, the depth first search is required to compute
63 // a max-volume max-flow min-cut specifically, because a max-flow min-cut is not, in general, unique.
64 
65 // Notes on implementation:
66 //
67 // 1. To compute depth optimal packing, an intermediate representation is used, where each cell with n output bits is split into n graph
68 // nodes. Each such graph node is represented directly with the wire bit (RTLIL::SigBit instance) that corresponds to the output bit
69 // it is created from. Fan-in and fan-out are represented explicitly by edge lists derived from the RTLIL graph. This IR never changes
70 // after it has been computed.
71 //
72 // In terms of data, this IR is comprised of `inputs`, `outputs`, `nodes`, `edges_fw` and `edges_bw` fields.
73 //
74 // We call this IR "gate IR".
75 //
76 // 2. To compute area optimal packing, another intermediate representation is used, which consists of some K-feasible cone for every node
77 // that exists in the gate IR. Immediately after depth optimal packing with FlowMap, each such cone occupies the lowest possible depth,
78 // but this is not true in general, and transformations of this IR may change the cones, although each transformation has to keep each
79 // cone K-feasible. In this IR, LUT fan-in and fan-out are represented explicitly by edge lists; if a K-feasible cone chosen for node A
80 // includes nodes B and C, there are edges between all predecessors of A, B and C in the gate IR and node A in this IR. Moreover, in
81 // this IR, cones may be *realized* or *derealized*. Only realized cones will end up mapped to actual LUTs in the output of this pass.
82 //
83 // Intuitively, this IR contains (some, ideally but not necessarily optimal) LUT representation for each input cell. By starting at outputs
84 // and traversing the graph of this IR backwards, each K-feasible cone is converted to an actual LUT at the end of the pass. This is
85 // the same as iterating through each realized LUT.
86 //
87 // The following are the invariants of this IR:
88 //   a) Each gate IR node corresponds to a K-feasible cut.
89 //   b) Each realized LUT is reachable through backward edges from some output.
90 //   c) The LUT fan-in is exactly the fan-in of its constituent gates minus the fan-out of its constituent gates.
91 // The invariants are kept even for derealized LUTs, since the whole point of this IR is ease of packing, unpacking, and repacking LUTs.
92 //
93 // In terms of data, this IR is comprised of `lut_nodes` (the set of all realized LUTs), `lut_gates` (the map from a LUT to its
94 // constituent gates), `lut_edges_fw` and `lut_edges_bw` fields. The `inputs` and `outputs` fields are shared with the gate IR.
95 //
96 // We call this IR "LUT IR".
97 
98 #include "kernel/yosys.h"
99 #include "kernel/sigtools.h"
100 #include "kernel/modtools.h"
101 #include "kernel/consteval.h"
102 
103 USING_YOSYS_NAMESPACE
104 PRIVATE_NAMESPACE_BEGIN
105 
106 struct GraphStyle
107 {
108 	string label;
109 	string color, fillcolor;
110 
GraphStyleGraphStyle111 	GraphStyle(string label = "", string color = "black", string fillcolor = "") :
112 		label(label), color(color), fillcolor(fillcolor) {}
113 };
114 
dot_escape(string value)115 static string dot_escape(string value)
116 {
117 	std::string escaped;
118 	for (char c : value) {
119 		if (c == '\n')
120 		{
121 			escaped += "\\n";
122 			continue;
123 		}
124 		if (c == '\\' || c == '"')
125 			escaped += "\\";
126 		escaped += c;
127 	}
128 	return escaped;
129 }
130 
dump_dot_graph(string filename,pool<RTLIL::SigBit> nodes,dict<RTLIL::SigBit,pool<RTLIL::SigBit>> edges,pool<RTLIL::SigBit> inputs,pool<RTLIL::SigBit> outputs,std::function<GraphStyle (RTLIL::SigBit)> node_style=[](RTLIL::SigBit){},std::function<GraphStyle (RTLIL::SigBit,RTLIL::SigBit)> edge_style=[](RTLIL::SigBit,RTLIL::SigBit){},string name="")131 static void dump_dot_graph(string filename,
132                            pool<RTLIL::SigBit> nodes, dict<RTLIL::SigBit, pool<RTLIL::SigBit>> edges,
133                            pool<RTLIL::SigBit> inputs, pool<RTLIL::SigBit> outputs,
134                            std::function<GraphStyle(RTLIL::SigBit)> node_style =
135                                    [](RTLIL::SigBit) { return GraphStyle{}; },
136                            std::function<GraphStyle(RTLIL::SigBit, RTLIL::SigBit)> edge_style =
__anona209c1680202(RTLIL::SigBit, RTLIL::SigBit) 137                                    [](RTLIL::SigBit, RTLIL::SigBit) { return GraphStyle{}; },
138                            string name = "")
139 {
140 	FILE *f = fopen(filename.c_str(), "w");
141 	fprintf(f, "digraph \"%s\" {\n", name.c_str());
142 	fprintf(f, "  rankdir=\"TB\";\n");
143 
144 	dict<RTLIL::SigBit, int> ids;
145 	for (auto node : nodes)
146 	{
147 		ids[node] = ids.size();
148 
149 		string shape = "ellipse";
150 		if (inputs[node])
151 			shape = "box";
152 		if (outputs[node])
153 			shape = "octagon";
154 		auto prop = node_style(node);
155 		string style = "";
156 		if (!prop.fillcolor.empty())
157 			style = "filled";
158 		fprintf(f, "  n%d [ shape=%s, fontname=\"Monospace\", label=\"%s\", color=\"%s\", fillcolor=\"%s\", style=\"%s\" ];\n",
159 		        ids[node], shape.c_str(), dot_escape(prop.label.c_str()).c_str(), prop.color.c_str(), prop.fillcolor.c_str(), style.c_str());
160 	}
161 
162 	fprintf(f, "  { rank=\"source\"; ");
163 	for (auto input : inputs)
164 		if (nodes[input])
165 			fprintf(f, "n%d; ", ids[input]);
166 	fprintf(f, "}\n");
167 
168 	fprintf(f, "  { rank=\"sink\"; ");
169 	for (auto output : outputs)
170 		if (nodes[output])
171 			fprintf(f, "n%d; ", ids[output]);
172 	fprintf(f, "}\n");
173 
174 	for (auto edge : edges)
175 	{
176 		auto source = edge.first;
177 		for (auto sink : edge.second) {
178 			if (nodes[source] && nodes[sink])
179 			{
180 				auto prop = edge_style(source, sink);
181 				fprintf(f, "  n%d -> n%d [ label=\"%s\", color=\"%s\", fillcolor=\"%s\" ];\n",
182 				        ids[source], ids[sink], dot_escape(prop.label.c_str()).c_str(), prop.color.c_str(), prop.fillcolor.c_str());
183 			}
184 		}
185 	}
186 
187 	fprintf(f, "}\n");
188 	fclose(f);
189 }
190 
191 struct FlowGraph
192 {
193 	const RTLIL::SigBit source;
194 	RTLIL::SigBit sink;
195 	pool<RTLIL::SigBit> nodes = {source};
196 	dict<RTLIL::SigBit, pool<RTLIL::SigBit>> edges_fw, edges_bw;
197 
198 	const int MAX_NODE_FLOW = 1;
199 	dict<RTLIL::SigBit, int> node_flow;
200 	dict<pair<RTLIL::SigBit, RTLIL::SigBit>, int> edge_flow;
201 
202 	dict<RTLIL::SigBit, pool<RTLIL::SigBit>> collapsed;
203 
dump_dot_graphFlowGraph204 	void dump_dot_graph(string filename)
205 	{
206 		auto node_style = [&](RTLIL::SigBit node) {
207 			string label = (node == source) ? "(source)" : log_signal(node);
208 			for (auto collapsed_node : collapsed[node])
209 				label += stringf(" %s", log_signal(collapsed_node));
210 			int flow = node_flow[node];
211 			if (node != source && node != sink)
212 				label += stringf("\n%d/%d", flow, MAX_NODE_FLOW);
213 			else
214 				label += stringf("\n%d/∞", flow);
215 			return GraphStyle{label, flow < MAX_NODE_FLOW ? "green" : "black"};
216 		};
217 		auto edge_style = [&](RTLIL::SigBit source, RTLIL::SigBit sink) {
218 			int flow = edge_flow[{source, sink}];
219 			return GraphStyle{stringf("%d/∞", flow), flow > 0 ? "blue" : "black"};
220 		};
221 		::dump_dot_graph(filename, nodes, edges_fw, {source}, {sink}, node_style, edge_style);
222 	}
223 
224 	// Here, we are working on the Nt'' network, but our representation is the Nt' network.
225 	// The difference between these is that where in Nt' we have a subgraph:
226 	//
227 	//   v1 -> v2 -> v3
228 	//
229 	// in Nt'' we have a corresponding subgraph:
230 	//
231 	//   v'1b -∞-> v'2t -f-> v'2b -∞-> v'3t
232 	//
233 	// To address this, we split each node v into two nodes, v't and v'b. This representation is virtual,
234 	// in the sense that nodes v't and v'b are overlaid on top of the original node v, and only exist
235 	// in paths and worklists.
236 
237 	struct NodePrime
238 	{
239 		RTLIL::SigBit node;
240 		bool is_bottom;
241 
NodePrimeFlowGraph::NodePrime242 		NodePrime(RTLIL::SigBit node, bool is_bottom) :
243 			node(node), is_bottom(is_bottom) {}
244 
operator ==FlowGraph::NodePrime245 		bool operator==(const NodePrime &other) const
246 		{
247 			return node == other.node && is_bottom == other.is_bottom;
248 		}
operator !=FlowGraph::NodePrime249 		bool operator!=(const NodePrime &other) const
250 		{
251 			return !(*this == other);
252 		}
hashFlowGraph::NodePrime253 		unsigned int hash() const
254 		{
255 			return hash_ops<pair<RTLIL::SigBit, int>>::hash({node, is_bottom});
256 		}
257 
topFlowGraph::NodePrime258 		static NodePrime top(RTLIL::SigBit node)
259 		{
260 			return NodePrime(node, /*is_bottom=*/false);
261 		}
262 
bottomFlowGraph::NodePrime263 		static NodePrime bottom(RTLIL::SigBit node)
264 		{
265 			return NodePrime(node, /*is_bottom=*/true);
266 		}
267 
as_topFlowGraph::NodePrime268 		NodePrime as_top() const
269 		{
270 			log_assert(is_bottom);
271 			return top(node);
272 		}
273 
as_bottomFlowGraph::NodePrime274 		NodePrime as_bottom() const
275 		{
276 			log_assert(!is_bottom);
277 			return bottom(node);
278 		}
279 	};
280 
find_augmenting_pathFlowGraph281 	bool find_augmenting_path(bool commit)
282 	{
283 		NodePrime source_prime = {source, true};
284 		NodePrime sink_prime = {sink, false};
285 		vector<NodePrime> path = {source_prime};
286 		pool<NodePrime> visited = {};
287 		bool found;
288 		do {
289 			found = false;
290 
291 			auto node_prime = path.back();
292 			visited.insert(node_prime);
293 
294 			if (!node_prime.is_bottom) // vt
295 			{
296 				if (!visited[node_prime.as_bottom()] && node_flow[node_prime.node] < MAX_NODE_FLOW)
297 				{
298 					path.push_back(node_prime.as_bottom());
299 					found = true;
300 				}
301 				else
302 				{
303 					for (auto node_pred : edges_bw[node_prime.node])
304 					{
305 						if (!visited[NodePrime::bottom(node_pred)] && edge_flow[{node_pred, node_prime.node}] > 0)
306 						{
307 							path.push_back(NodePrime::bottom(node_pred));
308 							found = true;
309 							break;
310 						}
311 					}
312 				}
313 			}
314 			else // vb
315 			{
316 				if (!visited[node_prime.as_top()] && node_flow[node_prime.node] > 0)
317 				{
318 					path.push_back(node_prime.as_top());
319 					found = true;
320 				}
321 				else
322 				{
323 					for (auto node_succ : edges_fw[node_prime.node])
324 					{
325 						if (!visited[NodePrime::top(node_succ)] /* && edge_flow[...] < ∞ */)
326 						{
327 							path.push_back(NodePrime::top(node_succ));
328 							found = true;
329 							break;
330 						}
331 					}
332 				}
333 			}
334 
335 			if (!found && path.size() > 1)
336 			{
337 				path.pop_back();
338 				found = true;
339 			}
340 		} while(path.back() != sink_prime && found);
341 
342 		if (commit && path.back() == sink_prime)
343 		{
344 			auto prev_prime = path.front();
345 			for (auto node_prime : path)
346 			{
347 				if (node_prime == source_prime)
348 					continue;
349 
350 				log_assert(prev_prime.is_bottom ^ node_prime.is_bottom);
351 				if (prev_prime.node == node_prime.node)
352 				{
353 					auto node = node_prime.node;
354 					if (!prev_prime.is_bottom && node_prime.is_bottom)
355 					{
356 						log_assert(node_flow[node] == 0);
357 						node_flow[node]++;
358 					}
359 					else
360 					{
361 						log_assert(node_flow[node] != 0);
362 						node_flow[node]--;
363 					}
364 				}
365 				else
366 				{
367 					if (prev_prime.is_bottom && !node_prime.is_bottom)
368 					{
369 						log_assert(true /* edge_flow[...] < ∞ */);
370 						edge_flow[{prev_prime.node, node_prime.node}]++;
371 					}
372 					else
373 					{
374 						log_assert((edge_flow[{node_prime.node, prev_prime.node}] > 0));
375 						edge_flow[{node_prime.node, prev_prime.node}]--;
376 					}
377 				}
378 				prev_prime = node_prime;
379 			}
380 
381 			node_flow[source]++;
382 			node_flow[sink]++;
383 		}
384 		return path.back() == sink_prime;
385 	}
386 
maximum_flowFlowGraph387 	int maximum_flow(int order)
388 	{
389 		int flow = 0;
390 		while (flow < order && find_augmenting_path(/*commit=*/true))
391 			flow++;
392 		return flow + find_augmenting_path(/*commit=*/false);
393 	}
394 
edge_cutFlowGraph395 	pair<pool<RTLIL::SigBit>, pool<RTLIL::SigBit>> edge_cut()
396 	{
397 		pool<RTLIL::SigBit> x = {source}, xi; // X and X̅ in the paper
398 
399 		NodePrime source_prime = {source, true};
400 		pool<NodePrime> visited;
401 		vector<NodePrime> worklist = {source_prime};
402 		while (!worklist.empty())
403 		{
404 			auto node_prime = worklist.back();
405 			worklist.pop_back();
406 			if (visited[node_prime])
407 				continue;
408 			visited.insert(node_prime);
409 
410 			if (!node_prime.is_bottom)
411 				x.insert(node_prime.node);
412 
413 			// Mincut is constructed by traversing a graph in an undirected way along forward edges that aren't full, or backward edges
414 			// that aren't empty.
415 			if (!node_prime.is_bottom) // top
416 			{
417 				if (node_flow[node_prime.node] < MAX_NODE_FLOW)
418 					worklist.push_back(node_prime.as_bottom());
419 				for (auto node_pred : edges_bw[node_prime.node])
420 					if (edge_flow[{node_pred, node_prime.node}] > 0)
421 						worklist.push_back(NodePrime::bottom(node_pred));
422 			}
423 			else // bottom
424 			{
425 				if (node_flow[node_prime.node] > 0)
426 					worklist.push_back(node_prime.as_top());
427 				for (auto node_succ : edges_fw[node_prime.node])
428 					if (true /* edge_flow[...] < ∞ */)
429 						worklist.push_back(NodePrime::top(node_succ));
430 			}
431 		}
432 
433 		for (auto node : nodes)
434 			if (!x[node])
435 				xi.insert(node);
436 
437 		for (auto collapsed_node : collapsed[sink])
438 			xi.insert(collapsed_node);
439 
440 		log_assert(x[source] && !xi[source]);
441 		log_assert(!x[sink] && xi[sink]);
442 		return {x, xi};
443 	}
444 };
445 
446 struct FlowmapWorker
447 {
448 	int order;
449 	int r_alpha, r_beta, r_gamma;
450 	bool debug, debug_relax;
451 
452 	RTLIL::Module *module;
453 	SigMap sigmap;
454 	ModIndex index;
455 
456 	dict<RTLIL::SigBit, ModIndex::PortInfo> node_origins;
457 
458 	// Gate IR
459 	pool<RTLIL::SigBit> nodes, inputs, outputs;
460 	dict<RTLIL::SigBit, pool<RTLIL::SigBit>> edges_fw, edges_bw;
461 	dict<RTLIL::SigBit, int> labels;
462 
463 	// LUT IR
464 	pool<RTLIL::SigBit> lut_nodes;
465 	dict<RTLIL::SigBit, pool<RTLIL::SigBit>> lut_gates;
466 	dict<RTLIL::SigBit, pool<RTLIL::SigBit>> lut_edges_fw, lut_edges_bw;
467 	dict<RTLIL::SigBit, int> lut_depths, lut_altitudes, lut_slacks;
468 
469 	int gate_count = 0, lut_count = 0, packed_count = 0;
470 	int gate_area = 0, lut_area = 0;
471 
472 	enum class GraphMode {
473 		Label,
474 		Cut,
475 		Slack,
476 	};
477 
dump_dot_graphFlowmapWorker478 	void dump_dot_graph(string filename, GraphMode mode,
479 	                    pool<RTLIL::SigBit> subgraph_nodes = {}, dict<RTLIL::SigBit, pool<RTLIL::SigBit>> subgraph_edges = {},
480 	                    dict<RTLIL::SigBit, pool<RTLIL::SigBit>> collapsed = {},
481 	                    pair<pool<RTLIL::SigBit>, pool<RTLIL::SigBit>> cut = {})
482 	{
483 		if (subgraph_nodes.empty())
484 			subgraph_nodes = nodes;
485 		if (subgraph_edges.empty())
486 			subgraph_edges = edges_fw;
487 
__anona209c1680502(RTLIL::SigBit node) 488 		auto node_style = [&](RTLIL::SigBit node) {
489 			string label = log_signal(node);
490 			for (auto collapsed_node : collapsed[node])
491 				if (collapsed_node != node)
492 					label += stringf(" %s", log_signal(collapsed_node));
493 			switch (mode)
494 			{
495 				case GraphMode::Label:
496 					if (labels[node] == -1)
497 					{
498 						label += "\nl=?";
499 						return GraphStyle{label};
500 					}
501 					else
502 					{
503 						label += stringf("\nl=%d", labels[node]);
504 						string fillcolor = stringf("/set311/%d", 1 + labels[node] % 11);
505 						return GraphStyle{label, "", fillcolor};
506 					}
507 
508 				case GraphMode::Cut:
509 					if (cut.first[node])
510 						return GraphStyle{label, "blue"};
511 					if (cut.second[node])
512 						return GraphStyle{label, "red"};
513 					return GraphStyle{label};
514 
515 				case GraphMode::Slack:
516 					label += stringf("\nd=%d a=%d\ns=%d", lut_depths[node], lut_altitudes[node], lut_slacks[node]);
517 					return GraphStyle{label, lut_slacks[node] == 0 ? "red" : "black"};
518 			}
519 			return GraphStyle{label};
520 		};
__anona209c1680602(RTLIL::SigBit, RTLIL::SigBit) 521 		auto edge_style = [&](RTLIL::SigBit, RTLIL::SigBit) {
522 			return GraphStyle{};
523 		};
524 		::dump_dot_graph(filename, subgraph_nodes, subgraph_edges, inputs, outputs, node_style, edge_style, module->name.str());
525 	}
526 
dump_dot_lut_graphFlowmapWorker527 	void dump_dot_lut_graph(string filename, GraphMode mode)
528 	{
529 		pool<RTLIL::SigBit> lut_and_input_nodes;
530 		lut_and_input_nodes.insert(lut_nodes.begin(), lut_nodes.end());
531 		lut_and_input_nodes.insert(inputs.begin(), inputs.end());
532 		dump_dot_graph(filename, mode, lut_and_input_nodes, lut_edges_fw, lut_gates);
533 	}
534 
find_subgraphFlowmapWorker535 	pool<RTLIL::SigBit> find_subgraph(RTLIL::SigBit sink)
536 	{
537 		pool<RTLIL::SigBit> subgraph;
538 		pool<RTLIL::SigBit> worklist = {sink};
539 		while (!worklist.empty())
540 		{
541 			auto node = worklist.pop();
542 			subgraph.insert(node);
543 			for (auto source : edges_bw[node])
544 			{
545 				if (!subgraph[source])
546 					worklist.insert(source);
547 			}
548 		}
549 		return subgraph;
550 	}
551 
build_flow_graphFlowmapWorker552 	FlowGraph build_flow_graph(RTLIL::SigBit sink, int p)
553 	{
554 		FlowGraph flow_graph;
555 		flow_graph.sink = sink;
556 
557 		pool<RTLIL::SigBit> worklist = {sink}, visited;
558 		while (!worklist.empty())
559 		{
560 			auto node = worklist.pop();
561 			visited.insert(node);
562 
563 			auto collapsed_node = labels[node] == p ? sink : node;
564 			if (node != collapsed_node)
565 				flow_graph.collapsed[collapsed_node].insert(node);
566 			flow_graph.nodes.insert(collapsed_node);
567 
568 			for (auto node_pred : edges_bw[node])
569 			{
570 				auto collapsed_node_pred = labels[node_pred] == p ? sink : node_pred;
571 				if (node_pred != collapsed_node_pred)
572 					flow_graph.collapsed[collapsed_node_pred].insert(node_pred);
573 				if (collapsed_node != collapsed_node_pred)
574 				{
575 					flow_graph.edges_bw[collapsed_node].insert(collapsed_node_pred);
576 					flow_graph.edges_fw[collapsed_node_pred].insert(collapsed_node);
577 				}
578 				if (inputs[node_pred])
579 				{
580 					flow_graph.edges_bw[collapsed_node_pred].insert(flow_graph.source);
581 					flow_graph.edges_fw[flow_graph.source].insert(collapsed_node_pred);
582 				}
583 
584 				if (!visited[node_pred])
585 					worklist.insert(node_pred);
586 			}
587 		}
588 		return flow_graph;
589 	}
590 
discover_nodesFlowmapWorker591 	void discover_nodes(pool<IdString> cell_types)
592 	{
593 		for (auto cell : module->selected_cells())
594 		{
595 			if (!cell_types[cell->type])
596 				continue;
597 
598 			if (!cell->known())
599 				log_error("Cell %s (%s.%s) is unknown.\n", cell->type.c_str(), log_id(module), log_id(cell));
600 
601 			pool<RTLIL::SigBit> fanout;
602 			for (auto conn : cell->connections())
603 			{
604 				if (!cell->output(conn.first)) continue;
605 				int offset = -1;
606 				for (auto bit : conn.second)
607 				{
608 					offset++;
609 					if (!bit.wire) continue;
610 					auto mapped_bit = sigmap(bit);
611 					if (nodes[mapped_bit])
612 						log_error("Multiple drivers found for wire %s.\n", log_signal(mapped_bit));
613 					nodes.insert(mapped_bit);
614 					node_origins[mapped_bit] = ModIndex::PortInfo(cell, conn.first, offset);
615 					fanout.insert(mapped_bit);
616 				}
617 			}
618 
619 			int fanin = 0;
620 			for (auto conn : cell->connections())
621 			{
622 				if (!cell->input(conn.first)) continue;
623 				for (auto bit : sigmap(conn.second))
624 				{
625 					if (!bit.wire) continue;
626 					for (auto fanout_bit : fanout)
627 					{
628 						edges_fw[bit].insert(fanout_bit);
629 						edges_bw[fanout_bit].insert(bit);
630 					}
631 					fanin++;
632 				}
633 			}
634 
635 			if (fanin > order)
636 				log_error("Cell %s (%s.%s) with fan-in %d cannot be mapped to a %d-LUT.\n",
637 				          cell->type.c_str(), log_id(module), log_id(cell), fanin, order);
638 
639 			gate_count++;
640 			gate_area += 1 << fanin;
641 		}
642 
643 		for (auto edge : edges_fw)
644 		{
645 			if (!nodes[edge.first])
646 			{
647 				inputs.insert(edge.first);
648 				nodes.insert(edge.first);
649 			}
650 		}
651 
652 		for (auto node : nodes)
653 		{
654 			auto node_info = index.query(node);
655 			if (node_info->is_output && !inputs[node])
656 				outputs.insert(node);
657 			for (auto port : node_info->ports)
658 				if (!cell_types[port.cell->type] && !inputs[node])
659 					outputs.insert(node);
660 		}
661 
662 		if (debug)
663 		{
664 			dump_dot_graph("flowmap-initial.dot", GraphMode::Label);
665 			log("Dumped initial graph to `flowmap-initial.dot`.\n");
666 		}
667 	}
668 
label_nodesFlowmapWorker669 	void label_nodes()
670 	{
671 		for (auto node : nodes)
672 			labels[node] = -1;
673 		for (auto input : inputs)
674 		{
675 			if (input.wire->attributes.count(ID($flowmap_level)))
676 				labels[input] = input.wire->attributes[ID($flowmap_level)].as_int();
677 			else
678 				labels[input] = 0;
679 		}
680 
681 		pool<RTLIL::SigBit> worklist = nodes;
682 		int debug_num = 0;
683 		while (!worklist.empty())
684 		{
685 			auto sink = worklist.pop();
686 			if (labels[sink] != -1)
687 				continue;
688 
689 			bool inputs_have_labels = true;
690 			for (auto sink_input : edges_bw[sink])
691 			{
692 				if (labels[sink_input] == -1)
693 				{
694 					inputs_have_labels = false;
695 					break;
696 				}
697 			}
698 			if (!inputs_have_labels)
699 				continue;
700 
701 			if (debug)
702 			{
703 				debug_num++;
704 				log("Examining subgraph %d rooted in %s.\n", debug_num, log_signal(sink));
705 			}
706 
707 			pool<RTLIL::SigBit> subgraph = find_subgraph(sink);
708 
709 			int p = 1;
710 			for (auto subgraph_node : subgraph)
711 				p = max(p, labels[subgraph_node]);
712 
713 			FlowGraph flow_graph = build_flow_graph(sink, p);
714 			int flow = flow_graph.maximum_flow(order);
715 			pool<RTLIL::SigBit> x, xi;
716 			if (flow <= order)
717 			{
718 				labels[sink] = p;
719 				auto cut = flow_graph.edge_cut();
720 				x = cut.first;
721 				xi = cut.second;
722 			}
723 			else
724 			{
725 				labels[sink] = p + 1;
726 				x = subgraph;
727 				x.erase(sink);
728 				xi.insert(sink);
729 			}
730 			lut_gates[sink] = xi;
731 
732 			pool<RTLIL::SigBit> k;
733 			for (auto xi_node : xi)
734 			{
735 				for (auto xi_node_pred : edges_bw[xi_node])
736 					if (x[xi_node_pred])
737 						k.insert(xi_node_pred);
738 			}
739 			log_assert((int)k.size() <= order);
740 			lut_edges_bw[sink] = k;
741 			for (auto k_node : k)
742 				lut_edges_fw[k_node].insert(sink);
743 
744 			if (debug)
745 			{
746 				log("  Maximum flow: %d. Assigned label %d.\n", flow, labels[sink]);
747 				dump_dot_graph(stringf("flowmap-%d-sub.dot", debug_num), GraphMode::Cut, subgraph, {}, {}, {x, xi});
748 				log("  Dumped subgraph to `flowmap-%d-sub.dot`.\n", debug_num);
749 				flow_graph.dump_dot_graph(stringf("flowmap-%d-flow.dot", debug_num));
750 				log("  Dumped flow graph to `flowmap-%d-flow.dot`.\n", debug_num);
751 				log("    LUT inputs:");
752 				for (auto k_node : k)
753 					log(" %s", log_signal(k_node));
754 				log(".\n");
755 				log("    LUT packed gates:");
756 				for (auto xi_node : xi)
757 					log(" %s", log_signal(xi_node));
758 				log(".\n");
759 			}
760 
761 			for (auto sink_succ : edges_fw[sink])
762 				worklist.insert(sink_succ);
763 		}
764 
765 		if (debug)
766 		{
767 			dump_dot_graph("flowmap-labeled.dot", GraphMode::Label);
768 			log("Dumped labeled graph to `flowmap-labeled.dot`.\n");
769 		}
770 	}
771 
map_lutsFlowmapWorker772 	int map_luts()
773 	{
774 		pool<RTLIL::SigBit> worklist = outputs;
775 		while (!worklist.empty())
776 		{
777 			auto lut_node = worklist.pop();
778 			lut_nodes.insert(lut_node);
779 			for (auto input_node : lut_edges_bw[lut_node])
780 				if (!lut_nodes[input_node] && !inputs[input_node])
781 					worklist.insert(input_node);
782 		}
783 
784 		int depth = 0;
785 		for (auto label : labels)
786 			depth = max(depth, label.second);
787 		log("Mapped to %d LUTs with maximum depth %d.\n", GetSize(lut_nodes), depth);
788 
789 		if (debug)
790 		{
791 			dump_dot_lut_graph("flowmap-mapped.dot", GraphMode::Label);
792 			log("Dumped mapped graph to `flowmap-mapped.dot`.\n");
793 		}
794 
795 		return depth;
796 	}
797 
realize_derealize_lutFlowmapWorker798 	void realize_derealize_lut(RTLIL::SigBit lut, pool<RTLIL::SigBit> *changed = nullptr)
799 	{
800 		pool<RTLIL::SigBit> worklist = {lut};
801 		while (!worklist.empty())
802 		{
803 			auto lut = worklist.pop();
804 			if (inputs[lut])
805 				continue;
806 
807 			bool realized_successors = false;
808 			for (auto lut_succ : lut_edges_fw[lut])
809 				if (lut_nodes[lut_succ])
810 					realized_successors = true;
811 
812 			if (realized_successors && !lut_nodes[lut])
813 				lut_nodes.insert(lut);
814 			else if (!realized_successors && lut_nodes[lut])
815 				lut_nodes.erase(lut);
816 			else
817 				continue;
818 
819 			for (auto lut_pred : lut_edges_bw[lut])
820 				worklist.insert(lut_pred);
821 
822 			if (changed)
823 				changed->insert(lut);
824 		}
825 	}
826 
add_lut_edgeFlowmapWorker827 	void add_lut_edge(RTLIL::SigBit pred, RTLIL::SigBit succ, pool<RTLIL::SigBit> *changed = nullptr)
828 	{
829 		log_assert(!lut_edges_fw[pred][succ] && !lut_edges_bw[succ][pred]);
830 		log_assert((int)lut_edges_bw[succ].size() < order);
831 
832 		lut_edges_fw[pred].insert(succ);
833 		lut_edges_bw[succ].insert(pred);
834 		realize_derealize_lut(pred, changed);
835 
836 		if (changed)
837 		{
838 			changed->insert(pred);
839 			changed->insert(succ);
840 		}
841 	}
842 
remove_lut_edgeFlowmapWorker843 	void remove_lut_edge(RTLIL::SigBit pred, RTLIL::SigBit succ, pool<RTLIL::SigBit> *changed = nullptr)
844 	{
845 		log_assert(lut_edges_fw[pred][succ] && lut_edges_bw[succ][pred]);
846 
847 		lut_edges_fw[pred].erase(succ);
848 		lut_edges_bw[succ].erase(pred);
849 		realize_derealize_lut(pred, changed);
850 
851 		if (changed)
852 		{
853 			if (lut_nodes[pred])
854 				changed->insert(pred);
855 			changed->insert(succ);
856 		}
857 	}
858 
cut_lut_at_gateFlowmapWorker859 	pair<pool<RTLIL::SigBit>, pool<RTLIL::SigBit>> cut_lut_at_gate(RTLIL::SigBit lut, RTLIL::SigBit lut_gate)
860 	{
861 		pool<RTLIL::SigBit> gate_inputs = lut_edges_bw[lut];
862 		pool<RTLIL::SigBit> other_inputs;
863 		pool<RTLIL::SigBit> worklist = {lut};
864 		while (!worklist.empty())
865 		{
866 			auto node = worklist.pop();
867 			for (auto node_pred : edges_bw[node])
868 			{
869 				if (node_pred == lut_gate)
870 					continue;
871 				if (lut_gates[lut][node_pred])
872 					worklist.insert(node_pred);
873 				else
874 				{
875 					gate_inputs.erase(node_pred);
876 					other_inputs.insert(node_pred);
877 				}
878 			}
879 		}
880 		return {gate_inputs, other_inputs};
881 	}
882 
compute_lut_distancesFlowmapWorker883 	void compute_lut_distances(dict<RTLIL::SigBit, int> &lut_distances, bool forward,
884 	                          pool<RTLIL::SigBit> initial = {}, pool<RTLIL::SigBit> *changed = nullptr)
885 	{
886 		pool<RTLIL::SigBit> terminals = forward ? inputs : outputs;
887 		auto &lut_edges_next = forward ? lut_edges_fw : lut_edges_bw;
888 		auto &lut_edges_prev = forward ? lut_edges_bw : lut_edges_fw;
889 
890 		if (initial.empty())
891 			initial = terminals;
892 		for (auto node : initial)
893 			lut_distances.erase(node);
894 
895 		pool<RTLIL::SigBit> worklist = initial;
896 		while (!worklist.empty())
897 		{
898 			auto lut = worklist.pop();
899 			int lut_distance = 0;
900 			if (forward && inputs[lut])
901 				lut_distance = labels[lut]; // to support (* $flowmap_level=n *)
902 			for (auto lut_prev : lut_edges_prev[lut])
903 				if ((lut_nodes[lut_prev] || inputs[lut_prev]) && lut_distances.count(lut_prev))
904 					lut_distance = max(lut_distance, lut_distances[lut_prev] + 1);
905 			if (!lut_distances.count(lut) || lut_distances[lut] != lut_distance)
906 			{
907 				lut_distances[lut] = lut_distance;
908 				if (changed != nullptr && !inputs[lut])
909 					changed->insert(lut);
910 				for (auto lut_next : lut_edges_next[lut])
911 					if (lut_nodes[lut_next] || inputs[lut_next])
912 						worklist.insert(lut_next);
913 			}
914 		}
915 	}
916 
check_lut_distancesFlowmapWorker917 	void check_lut_distances(const dict<RTLIL::SigBit, int> &lut_distances, bool forward)
918 	{
919 		dict<RTLIL::SigBit, int> gold_lut_distances;
920 		compute_lut_distances(gold_lut_distances, forward);
921 		for (auto lut_distance : lut_distances)
922 			if (lut_nodes[lut_distance.first])
923 				log_assert(lut_distance.second == gold_lut_distances[lut_distance.first]);
924 	}
925 
926 	// LUT depth is the length of the longest path from any input in LUT fan-in to LUT.
927 	// LUT altitude (for lack of a better term) is the length of the longest path from LUT to any output in LUT fan-out.
update_lut_depths_altitudesFlowmapWorker928 	void update_lut_depths_altitudes(pool<RTLIL::SigBit> worklist = {}, pool<RTLIL::SigBit> *changed = nullptr)
929 	{
930 		compute_lut_distances(lut_depths, /*forward=*/true, worklist, changed);
931 		compute_lut_distances(lut_altitudes, /*forward=*/false, worklist, changed);
932 		if (debug_relax && !worklist.empty()) {
933 			check_lut_distances(lut_depths, /*forward=*/true);
934 			check_lut_distances(lut_altitudes, /*forward=*/false);
935 		}
936 	}
937 
938 	// LUT critical output set is the set of outputs whose depth will increase (equivalently, slack will decrease) if the depth of
939 	// the LUT increases. (This is referred to as RPOv for LUTv in the paper.)
compute_lut_critical_outputsFlowmapWorker940 	void compute_lut_critical_outputs(dict<RTLIL::SigBit, pool<RTLIL::SigBit>> &lut_critical_outputs,
941 	                                  pool<RTLIL::SigBit> worklist = {})
942 	{
943 		if (worklist.empty())
944 			worklist = lut_nodes;
945 
946 		while (!worklist.empty())
947 		{
948 			bool updated_some = false;
949 			for (auto lut : worklist)
950 			{
951 				if (outputs[lut])
952 					lut_critical_outputs[lut] = {lut};
953 				else
954 				{
955 					bool all_succ_computed = true;
956 					lut_critical_outputs[lut] = {};
957 					for (auto lut_succ : lut_edges_fw[lut])
958 					{
959 						if (lut_nodes[lut_succ] && lut_depths[lut_succ] == lut_depths[lut] + 1)
960 						{
961 							if (lut_critical_outputs.count(lut_succ))
962 								lut_critical_outputs[lut].insert(lut_critical_outputs[lut_succ].begin(), lut_critical_outputs[lut_succ].end());
963 							else
964 							{
965 								all_succ_computed = false;
966 								break;
967 							}
968 						}
969 					}
970 					if (!all_succ_computed)
971 					{
972 						lut_critical_outputs.erase(lut);
973 						continue;
974 					}
975 				}
976 				worklist.erase(lut);
977 				updated_some = true;
978 			}
979 			log_assert(updated_some);
980 		}
981 	}
982 
983 	// Invalidating LUT critical output sets is tricky, because increasing the depth of a LUT may take other, adjacent LUTs off the critical
984 	// path to the output. Conservatively, if we increase depth of some LUT, every LUT in its input cone needs to have its critical output
985 	// set invalidated, too.
invalidate_lut_critical_outputsFlowmapWorker986 	pool<RTLIL::SigBit> invalidate_lut_critical_outputs(dict<RTLIL::SigBit, pool<RTLIL::SigBit>> &lut_critical_outputs,
987 	                                                    pool<RTLIL::SigBit> worklist)
988 	{
989 		pool<RTLIL::SigBit> changed;
990 		while (!worklist.empty())
991 		{
992 			auto lut = worklist.pop();
993 			changed.insert(lut);
994 			lut_critical_outputs.erase(lut);
995 			for (auto lut_pred : lut_edges_bw[lut])
996 			{
997 				if (lut_nodes[lut_pred] && !changed[lut_pred])
998 				{
999 					changed.insert(lut_pred);
1000 					worklist.insert(lut_pred);
1001 				}
1002 			}
1003 		}
1004 		return changed;
1005 	}
1006 
check_lut_critical_outputsFlowmapWorker1007 	void check_lut_critical_outputs(const dict<RTLIL::SigBit, pool<RTLIL::SigBit>> &lut_critical_outputs)
1008 	{
1009 		dict<RTLIL::SigBit, pool<RTLIL::SigBit>> gold_lut_critical_outputs;
1010 		compute_lut_critical_outputs(gold_lut_critical_outputs);
1011 		for (auto lut_critical_output : lut_critical_outputs)
1012 			if (lut_nodes[lut_critical_output.first])
1013 				log_assert(lut_critical_output.second == gold_lut_critical_outputs[lut_critical_output.first]);
1014 	}
1015 
update_lut_critical_outputsFlowmapWorker1016 	void update_lut_critical_outputs(dict<RTLIL::SigBit, pool<RTLIL::SigBit>> &lut_critical_outputs,
1017 	                                 pool<RTLIL::SigBit> worklist = {})
1018 	{
1019 		if (!worklist.empty())
1020 		{
1021 			pool<RTLIL::SigBit> invalidated = invalidate_lut_critical_outputs(lut_critical_outputs, worklist);
1022 			compute_lut_critical_outputs(lut_critical_outputs, invalidated);
1023 			check_lut_critical_outputs(lut_critical_outputs);
1024 		}
1025 		else
1026 			compute_lut_critical_outputs(lut_critical_outputs);
1027 	}
1028 
update_breaking_node_potentialsFlowmapWorker1029 	void update_breaking_node_potentials(dict<RTLIL::SigBit, dict<RTLIL::SigBit, int>> &potentials,
1030 	                                     const dict<RTLIL::SigBit, pool<RTLIL::SigBit>> &lut_critical_outputs)
1031 	{
1032 		for (auto lut : lut_nodes)
1033 		{
1034 			if (potentials.count(lut))
1035 				continue;
1036 			if (lut_gates[lut].size() == 1 || lut_slacks[lut] == 0)
1037 				continue;
1038 
1039 			if (debug_relax)
1040 				log("  Computing potentials for LUT %s.\n", log_signal(lut));
1041 
1042 			for (auto lut_gate : lut_gates[lut])
1043 			{
1044 				if (lut == lut_gate)
1045 					continue;
1046 
1047 				if (debug_relax)
1048 					log("    Considering breaking node %s.\n", log_signal(lut_gate));
1049 
1050 				int r_ex, r_im, r_slk;
1051 
1052 				auto cut_inputs = cut_lut_at_gate(lut, lut_gate);
1053 				pool<RTLIL::SigBit> gate_inputs = cut_inputs.first, other_inputs = cut_inputs.second;
1054 				if (gate_inputs.empty() && (int)other_inputs.size() >= order)
1055 				{
1056 					if (debug_relax)
1057 						log("      Breaking would result in a (k+1)-LUT.\n");
1058 					continue;
1059 				}
1060 
1061 				pool<RTLIL::SigBit> elim_fanin_luts;
1062 				for (auto gate_input : gate_inputs)
1063 				{
1064 					if (lut_edges_fw[gate_input].size() == 1)
1065 					{
1066 						log_assert(lut_edges_fw[gate_input][lut]);
1067 						elim_fanin_luts.insert(gate_input);
1068 					}
1069 				}
1070 				if (debug_relax)
1071 				{
1072 					if (!lut_nodes[lut_gate])
1073 						log("      Breaking requires a new LUT.\n");
1074 					if (!gate_inputs.empty())
1075 					{
1076 						log("      Breaking eliminates LUT inputs");
1077 						for (auto gate_input : gate_inputs)
1078 							log(" %s", log_signal(gate_input));
1079 						log(".\n");
1080 					}
1081 					if (!elim_fanin_luts.empty())
1082 					{
1083 						log("      Breaking eliminates fan-in LUTs");
1084 						for (auto elim_fanin_lut : elim_fanin_luts)
1085 							log(" %s", log_signal(elim_fanin_lut));
1086 						log(".\n");
1087 					}
1088 				}
1089 				r_ex = (lut_nodes[lut_gate] ? 0 : -1) + elim_fanin_luts.size();
1090 
1091 				pool<pair<RTLIL::SigBit, RTLIL::SigBit>> maybe_mergeable_luts;
1092 
1093 				// Try to merge LUTv with one of its successors.
1094 				RTLIL::SigBit last_lut_succ;
1095 				int fanout = 0;
1096 				for (auto lut_succ : lut_edges_fw[lut])
1097 				{
1098 					if (lut_nodes[lut_succ])
1099 					{
1100 						fanout++;
1101 						last_lut_succ = lut_succ;
1102 					}
1103 				}
1104 				if (fanout == 1)
1105 					maybe_mergeable_luts.insert({lut, last_lut_succ});
1106 
1107 				// Try to merge LUTv with one of its predecessors.
1108 				for (auto lut_pred : other_inputs)
1109 				{
1110 					int fanout = 0;
1111 					for (auto lut_pred_succ : lut_edges_fw[lut_pred])
1112 						if (lut_nodes[lut_pred_succ] || lut_pred_succ == lut_gate)
1113 							fanout++;
1114 					if (fanout == 1)
1115 						maybe_mergeable_luts.insert({lut_pred, lut});
1116 				}
1117 
1118 				// Try to merge LUTw with one of its predecessors.
1119 				for (auto lut_gate_pred : lut_edges_bw[lut_gate])
1120 				{
1121 					int fanout = 0;
1122 					for (auto lut_gate_pred_succ : lut_edges_fw[lut_gate_pred])
1123 						if (lut_nodes[lut_gate_pred_succ] || lut_gate_pred_succ == lut_gate)
1124 							fanout++;
1125 					if (fanout == 1)
1126 						maybe_mergeable_luts.insert({lut_gate_pred, lut_gate});
1127 				}
1128 
1129 				r_im = 0;
1130 				for (auto maybe_mergeable_pair : maybe_mergeable_luts)
1131 				{
1132 					log_assert(lut_edges_fw[maybe_mergeable_pair.first][maybe_mergeable_pair.second]);
1133 					pool<RTLIL::SigBit> unique_inputs;
1134 					for (auto fst_lut_pred : lut_edges_bw[maybe_mergeable_pair.first])
1135 						if (lut_nodes[fst_lut_pred])
1136 							unique_inputs.insert(fst_lut_pred);
1137 					for (auto snd_lut_pred : lut_edges_bw[maybe_mergeable_pair.second])
1138 						if (lut_nodes[snd_lut_pred])
1139 							unique_inputs.insert(snd_lut_pred);
1140 					unique_inputs.erase(maybe_mergeable_pair.first);
1141 					if ((int)unique_inputs.size() <= order)
1142 					{
1143 						if (debug_relax)
1144 							log("      Breaking may allow merging %s and %s.\n",
1145 							    log_signal(maybe_mergeable_pair.first), log_signal(maybe_mergeable_pair.second));
1146 						r_im++;
1147 					}
1148 				}
1149 
1150 				int lut_gate_depth;
1151 				if (lut_nodes[lut_gate])
1152 					lut_gate_depth = lut_depths[lut_gate];
1153 				else
1154 				{
1155 					lut_gate_depth = 0;
1156 					for (auto lut_gate_pred : lut_edges_bw[lut_gate])
1157 						lut_gate_depth = max(lut_gate_depth, lut_depths[lut_gate_pred] + 1);
1158 				}
1159 				if (lut_depths[lut] >= lut_gate_depth + 1)
1160 					r_slk = 0;
1161 				else
1162 				{
1163 					int depth_delta = lut_gate_depth + 1 - lut_depths[lut];
1164 					if (depth_delta > lut_slacks[lut])
1165 					{
1166 						if (debug_relax)
1167 							log("      Breaking would increase depth by %d, which is more than available slack.\n", depth_delta);
1168 						continue;
1169 					}
1170 
1171 					if (debug_relax)
1172 					{
1173 						log("      Breaking increases depth of LUT by %d.\n", depth_delta);
1174 						if (lut_critical_outputs.at(lut).size())
1175 						{
1176 							log("      Breaking decreases slack of outputs");
1177 							for (auto lut_critical_output : lut_critical_outputs.at(lut))
1178 							{
1179 								log(" %s", log_signal(lut_critical_output));
1180 								log_assert(lut_slacks[lut_critical_output] > 0);
1181 							}
1182 							log(".\n");
1183 						}
1184 					}
1185 					r_slk = lut_critical_outputs.at(lut).size() * depth_delta;
1186 				}
1187 
1188 				int p = 100 * (r_alpha * r_ex + r_beta * r_im + r_gamma) / (r_slk + 1);
1189 				if (debug_relax)
1190 					log("    Potential for breaking node %s: %d (Rex=%d, Rim=%d, Rslk=%d).\n",
1191 					    log_signal(lut_gate), p, r_ex, r_im, r_slk);
1192 				potentials[lut][lut_gate] = p;
1193 			}
1194 		}
1195 	}
1196 
relax_depth_for_boundFlowmapWorker1197 	bool relax_depth_for_bound(bool first, int depth_bound, dict<RTLIL::SigBit, pool<RTLIL::SigBit>> &lut_critical_outputs)
1198 	{
1199 		int initial_count = GetSize(lut_nodes);
1200 
1201 		for (auto node : lut_nodes)
1202 		{
1203 			lut_slacks[node] = depth_bound - (lut_depths[node] + lut_altitudes[node]);
1204 			log_assert(lut_slacks[node] >= 0);
1205 		}
1206 		if (debug)
1207 		{
1208 			dump_dot_lut_graph(stringf("flowmap-relax-%d-initial.dot", depth_bound), GraphMode::Slack);
1209 			log("  Dumped initial slack graph to `flowmap-relax-%d-initial.dot`.\n", depth_bound);
1210 		}
1211 
1212 		dict<RTLIL::SigBit, dict<RTLIL::SigBit, int>> potentials;
1213 		for (int break_num = 1; ; break_num++)
1214 		{
1215 			update_breaking_node_potentials(potentials, lut_critical_outputs);
1216 
1217 			if (potentials.empty())
1218 			{
1219 				log("  Relaxed to %d (+%d) LUTs.\n", GetSize(lut_nodes), GetSize(lut_nodes) - initial_count);
1220 				if (!first && break_num == 1)
1221 				{
1222 					log("  Design fully relaxed.\n");
1223 					return true;
1224 				}
1225 				else
1226 				{
1227 					log("  Slack exhausted.\n");
1228 					break;
1229 				}
1230 			}
1231 
1232 			RTLIL::SigBit breaking_lut, breaking_gate;
1233 			int best_potential = INT_MIN;
1234 			for (auto lut_gate_potentials : potentials)
1235 			{
1236 				for (auto gate_potential : lut_gate_potentials.second)
1237 				{
1238 					if (gate_potential.second > best_potential)
1239 					{
1240 						breaking_lut = lut_gate_potentials.first;
1241 						breaking_gate = gate_potential.first;
1242 						best_potential = gate_potential.second;
1243 					}
1244 				}
1245 			}
1246 			log("  Breaking LUT %s to %s LUT %s (potential %d).\n",
1247 			    log_signal(breaking_lut), lut_nodes[breaking_gate] ? "reuse" : "extract", log_signal(breaking_gate), best_potential);
1248 
1249 			if (debug_relax)
1250 				log("    Removing breaking gate %s from LUT.\n", log_signal(breaking_gate));
1251 			lut_gates[breaking_lut].erase(breaking_gate);
1252 
1253 			auto cut_inputs = cut_lut_at_gate(breaking_lut, breaking_gate);
1254 			pool<RTLIL::SigBit> gate_inputs = cut_inputs.first, other_inputs = cut_inputs.second;
1255 
1256 			pool<RTLIL::SigBit> worklist = lut_gates[breaking_lut];
1257 			pool<RTLIL::SigBit> elim_gates = gate_inputs;
1258 			while (!worklist.empty())
1259 			{
1260 				auto lut_gate = worklist.pop();
1261 				bool all_gate_preds_elim = true;
1262 				for (auto lut_gate_pred : edges_bw[lut_gate])
1263 					if (!elim_gates[lut_gate_pred])
1264 						all_gate_preds_elim = false;
1265 				if (all_gate_preds_elim)
1266 				{
1267 					if (debug_relax)
1268 						log("    Removing gate %s from LUT.\n", log_signal(lut_gate));
1269 					lut_gates[breaking_lut].erase(lut_gate);
1270 					for (auto lut_gate_succ : edges_fw[lut_gate])
1271 						worklist.insert(lut_gate_succ);
1272 				}
1273 			}
1274 			log_assert(!lut_gates[breaking_lut].empty());
1275 
1276 			pool<RTLIL::SigBit> directly_affected_nodes = {breaking_lut};
1277 			for (auto gate_input : gate_inputs)
1278 			{
1279 				if (debug_relax)
1280 					log("    Removing LUT edge %s -> %s.\n", log_signal(gate_input), log_signal(breaking_lut));
1281 				remove_lut_edge(gate_input, breaking_lut, &directly_affected_nodes);
1282 			}
1283 			if (debug_relax)
1284 				log("    Adding LUT edge %s -> %s.\n", log_signal(breaking_gate), log_signal(breaking_lut));
1285 			add_lut_edge(breaking_gate, breaking_lut, &directly_affected_nodes);
1286 
1287 			if (debug_relax)
1288 				log("  Updating slack and potentials.\n");
1289 
1290 			pool<RTLIL::SigBit> indirectly_affected_nodes = {};
1291 			update_lut_depths_altitudes(directly_affected_nodes, &indirectly_affected_nodes);
1292 			update_lut_critical_outputs(lut_critical_outputs, indirectly_affected_nodes);
1293 			for (auto node : indirectly_affected_nodes)
1294 			{
1295 				lut_slacks[node] = depth_bound - (lut_depths[node] + lut_altitudes[node]);
1296 				log_assert(lut_slacks[node] >= 0);
1297 				if (debug_relax)
1298 					log("    LUT %s now has depth %d and slack %d.\n", log_signal(node), lut_depths[node], lut_slacks[node]);
1299 			}
1300 
1301 			worklist = indirectly_affected_nodes;
1302 			pool<RTLIL::SigBit> visited;
1303 			while (!worklist.empty())
1304 			{
1305 				auto node = worklist.pop();
1306 				visited.insert(node);
1307 				potentials.erase(node);
1308 				// We are invalidating the entire output cone of the gate IR node, not just of the LUT IR node. This is done to also invalidate
1309 				// all LUTs that could contain one of the indirectly affected nodes as a *part* of them, as they may not be in the output cone
1310 				// of any of the LUT IR nodes, e.g. if we have a LUT IR node A and node B as predecessors of node C, where node B includes all
1311 				// gates from node A.
1312 				for (auto node_succ : edges_fw[node])
1313 					if (!visited[node_succ])
1314 						worklist.insert(node_succ);
1315 			}
1316 
1317 			if (debug)
1318 			{
1319 				dump_dot_lut_graph(stringf("flowmap-relax-%d-break-%d.dot", depth_bound, break_num), GraphMode::Slack);
1320 				log("  Dumped slack graph after break %d to `flowmap-relax-%d-break-%d.dot`.\n",  break_num, depth_bound, break_num);
1321 			}
1322 		}
1323 
1324 		return false;
1325 	}
1326 
optimize_areaFlowmapWorker1327 	void optimize_area(int depth, int optarea)
1328 	{
1329 		dict<RTLIL::SigBit, pool<RTLIL::SigBit>> lut_critical_outputs;
1330 		update_lut_depths_altitudes();
1331 		update_lut_critical_outputs(lut_critical_outputs);
1332 
1333 		for (int depth_bound = depth; depth_bound <= depth + optarea; depth_bound++)
1334 		{
1335 			log("Relaxing with depth bound %d.\n", depth_bound);
1336 			bool fully_relaxed = relax_depth_for_bound(depth_bound == depth, depth_bound, lut_critical_outputs);
1337 
1338 			if (fully_relaxed)
1339 				break;
1340 		}
1341 	}
1342 
pack_cellsFlowmapWorker1343 	void pack_cells(int minlut)
1344 	{
1345 		ConstEval ce(module);
1346 		for (auto input_node : inputs)
1347 			ce.stop(input_node);
1348 
1349 		pool<RTLIL::SigBit> mapped_nodes;
1350 		for (auto node : lut_nodes)
1351 		{
1352 			if (node_origins.count(node))
1353 			{
1354 				auto origin = node_origins[node];
1355 				if (origin.cell->getPort(origin.port).size() == 1)
1356 					log("Packing %s.%s.%s (%s).\n",
1357 					    log_id(module), log_id(origin.cell), origin.port.c_str(), log_signal(node));
1358 				else
1359 					log("Packing %s.%s.%s [%d] (%s).\n",
1360 					    log_id(module), log_id(origin.cell), origin.port.c_str(), origin.offset, log_signal(node));
1361 			}
1362 			else
1363 			{
1364 				log("Packing %s.%s.\n", log_id(module), log_signal(node));
1365 			}
1366 
1367 			for (auto gate_node : lut_gates[node])
1368 			{
1369 				log_assert(node_origins.count(gate_node));
1370 
1371 				if (gate_node == node)
1372 					continue;
1373 
1374 				auto gate_origin = node_origins[gate_node];
1375 				if (gate_origin.cell->getPort(gate_origin.port).size() == 1)
1376 					log("  Packing %s.%s.%s (%s).\n",
1377 					    log_id(module), log_id(gate_origin.cell), gate_origin.port.c_str(), log_signal(gate_node));
1378 				else
1379 					log("  Packing %s.%s.%s [%d] (%s).\n",
1380 					    log_id(module), log_id(gate_origin.cell), gate_origin.port.c_str(), gate_origin.offset, log_signal(gate_node));
1381 			}
1382 
1383 			vector<RTLIL::SigBit> input_nodes(lut_edges_bw[node].begin(), lut_edges_bw[node].end());
1384 			RTLIL::Const lut_table(State::Sx, max(1 << input_nodes.size(), 1 << minlut));
1385 			unsigned const mask = 1 << input_nodes.size();
1386 			for (unsigned i = 0; i < mask; i++)
1387 			{
1388 				ce.push();
1389 				for (size_t n = 0; n < input_nodes.size(); n++)
1390 					ce.set(input_nodes[n], ((i >> n) & 1) ? State::S1 : State::S0);
1391 
1392 				RTLIL::SigSpec value = node, undef;
1393 				if (!ce.eval(value, undef))
1394 				{
1395 					string env;
1396 					for (auto input_node : input_nodes)
1397 						env += stringf("  %s = %s\n", log_signal(input_node), log_signal(ce.values_map(input_node)));
1398 					log_error("Cannot evaluate %s because %s is not defined.\nEvaluation environment:\n%s",
1399 					          log_signal(node), log_signal(undef), env.c_str());
1400 				}
1401 
1402 				lut_table[i] = value.as_bool() ? State::S1 : State::S0;
1403 				ce.pop();
1404 			}
1405 
1406 			RTLIL::SigSpec lut_a, lut_y = node;
1407 			for (auto input_node : input_nodes)
1408 				lut_a.append(input_node);
1409 			lut_a.append(RTLIL::Const(State::Sx, minlut - input_nodes.size()));
1410 
1411 			RTLIL::Cell *lut = module->addLut(NEW_ID, lut_a, lut_y, lut_table);
1412 			mapped_nodes.insert(node);
1413 			for (auto gate_node : lut_gates[node])
1414 			{
1415 				auto gate_origin = node_origins[gate_node];
1416 				lut->add_strpool_attribute(ID::src, gate_origin.cell->get_strpool_attribute(ID::src));
1417 				packed_count++;
1418 			}
1419 			lut_count++;
1420 			lut_area += lut_table.size();
1421 
1422 			if ((int)input_nodes.size() >= minlut)
1423 				log("  Packed into a %d-LUT %s.%s.\n", GetSize(input_nodes), log_id(module), log_id(lut));
1424 			else
1425 				log("  Packed into a %d-LUT %s.%s (implemented as %d-LUT).\n", GetSize(input_nodes), log_id(module), log_id(lut), minlut);
1426 		}
1427 
1428 		for (auto node : mapped_nodes)
1429 		{
1430 			auto origin = node_origins[node];
1431 			RTLIL::SigSpec driver = origin.cell->getPort(origin.port);
1432 			driver[origin.offset] = module->addWire(NEW_ID);
1433 			origin.cell->setPort(origin.port, driver);
1434 		}
1435 	}
1436 
FlowmapWorkerFlowmapWorker1437 	FlowmapWorker(int order, int minlut, pool<IdString> cell_types, int r_alpha, int r_beta, int r_gamma,
1438 	              bool relax, int optarea, bool debug, bool debug_relax,
1439 	              RTLIL::Module *module) :
1440 		order(order), r_alpha(r_alpha), r_beta(r_beta), r_gamma(r_gamma), debug(debug), debug_relax(debug_relax),
1441 		module(module), sigmap(module), index(module)
1442 	{
1443 		log("Labeling cells.\n");
1444 		discover_nodes(cell_types);
1445 		label_nodes();
1446 		int depth = map_luts();
1447 
1448 		if (relax)
1449 		{
1450 			log("\n");
1451 			log("Optimizing area.\n");
1452 			optimize_area(depth, optarea);
1453 		}
1454 
1455 		log("\n");
1456 		log("Packing cells.\n");
1457 		pack_cells(minlut);
1458 	}
1459 };
1460 
split(std::vector<std::string> & tokens,const std::string & text,char sep)1461 static void split(std::vector<std::string> &tokens, const std::string &text, char sep)
1462 {
1463 	size_t start = 0, end = 0;
1464 	while ((end = text.find(sep, start)) != std::string::npos) {
1465 		tokens.push_back(text.substr(start, end - start));
1466 		start = end + 1;
1467 	}
1468 	tokens.push_back(text.substr(start));
1469 }
1470 
1471 struct FlowmapPass : public Pass {
FlowmapPassFlowmapPass1472 	FlowmapPass() : Pass("flowmap", "pack LUTs with FlowMap") { }
helpFlowmapPass1473 	void help() override
1474 	{
1475 		//   |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|
1476 		log("\n");
1477 		log("    flowmap [options] [selection]\n");
1478 		log("\n");
1479 		log("This pass uses the FlowMap technology mapping algorithm to pack logic gates\n");
1480 		log("into k-LUTs with optimal depth. It allows mapping any circuit elements that can\n");
1481 		log("be evaluated with the `eval` pass, including cells with multiple output ports\n");
1482 		log("and multi-bit input and output ports.\n");
1483 		log("\n");
1484 		log("    -maxlut k\n");
1485 		log("        perform technology mapping for a k-LUT architecture. if not specified,\n");
1486 		log("        defaults to 3.\n");
1487 		log("\n");
1488 		log("    -minlut n\n");
1489 		log("        only produce n-input or larger LUTs. if not specified, defaults to 1.\n");
1490 		log("\n");
1491 		log("    -cells <cell>[,<cell>,...]\n");
1492 		log("        map specified cells. if not specified, maps $_NOT_, $_AND_, $_OR_,\n");
1493 		log("        $_XOR_ and $_MUX_, which are the outputs of the `simplemap` pass.\n");
1494 		log("\n");
1495 		log("    -relax\n");
1496 		log("        perform depth relaxation and area minimization.\n");
1497 		log("\n");
1498 		log("    -r-alpha n, -r-beta n, -r-gamma n\n");
1499 		log("        parameters of depth relaxation heuristic potential function.\n");
1500 		log("        if not specified, alpha=8, beta=2, gamma=1.\n");
1501 		log("\n");
1502 		log("    -optarea n\n");
1503 		log("        optimize for area by trading off at most n logic levels for fewer LUTs.\n");
1504 		log("        n may be zero, to optimize for area without increasing depth.\n");
1505 		log("        implies -relax.\n");
1506 		log("\n");
1507 		log("    -debug\n");
1508 		log("        dump intermediate graphs.\n");
1509 		log("\n");
1510 		log("    -debug-relax\n");
1511 		log("        explain decisions performed during depth relaxation.\n");
1512 		log("\n");
1513 	}
executeFlowmapPass1514 	void execute(std::vector<std::string> args, RTLIL::Design *design) override
1515 	{
1516 		int order = 3;
1517 		int minlut = 1;
1518 		vector<string> cells;
1519 		bool relax = false;
1520 		int r_alpha = 8, r_beta = 2, r_gamma = 1;
1521 		int optarea = 0;
1522 		bool debug = false, debug_relax = false;
1523 
1524 		size_t argidx;
1525 		for (argidx = 1; argidx < args.size(); argidx++)
1526 		{
1527 			if (args[argidx] == "-maxlut" && argidx + 1 < args.size())
1528 			{
1529 				order = atoi(args[++argidx].c_str());
1530 				continue;
1531 			}
1532 			if (args[argidx] == "-minlut" && argidx + 1 < args.size())
1533 			{
1534 				minlut = atoi(args[++argidx].c_str());
1535 				continue;
1536 			}
1537 			if (args[argidx] == "-cells" && argidx + 1 < args.size())
1538 			{
1539 				split(cells, args[++argidx], ',');
1540 				continue;
1541 			}
1542 			if (args[argidx] == "-relax")
1543 			{
1544 				relax = true;
1545 				continue;
1546 			}
1547 			if (args[argidx] == "-r-alpha" && argidx + 1 < args.size())
1548 			{
1549 				r_alpha = atoi(args[++argidx].c_str());
1550 				continue;
1551 			}
1552 			if (args[argidx] == "-r-beta" && argidx + 1 < args.size())
1553 			{
1554 				r_beta = atoi(args[++argidx].c_str());
1555 				continue;
1556 			}
1557 			if (args[argidx] == "-r-gamma" && argidx + 1 < args.size())
1558 			{
1559 				r_gamma = atoi(args[++argidx].c_str());
1560 				continue;
1561 			}
1562 			if (args[argidx] == "-optarea" && argidx + 1 < args.size())
1563 			{
1564 				relax = true;
1565 				optarea = atoi(args[++argidx].c_str());
1566 				continue;
1567 			}
1568 			if (args[argidx] == "-debug")
1569 			{
1570 				debug = true;
1571 				continue;
1572 			}
1573 			if (args[argidx] == "-debug-relax")
1574 			{
1575 				debug = debug_relax = true;
1576 				continue;
1577 			}
1578 			break;
1579 		}
1580 		extra_args(args, argidx, design);
1581 
1582 		pool<IdString> cell_types;
1583 		if (!cells.empty())
1584 		{
1585 			for (auto &cell : cells)
1586 				cell_types.insert(cell);
1587 		}
1588 		else
1589 		{
1590 			cell_types = {ID($_NOT_), ID($_AND_), ID($_OR_), ID($_XOR_), ID($_MUX_)};
1591 		}
1592 
1593 		const char *algo_r = relax ? "-r" : "";
1594 		log_header(design, "Executing FLOWMAP pass (pack LUTs with FlowMap%s).\n", algo_r);
1595 
1596 		int gate_count = 0, lut_count = 0, packed_count = 0;
1597 		int gate_area = 0, lut_area = 0;
1598 		for (auto module : design->selected_modules())
1599 		{
1600 			FlowmapWorker worker(order, minlut, cell_types, r_alpha, r_beta, r_gamma, relax, optarea, debug, debug_relax, module);
1601 			gate_count += worker.gate_count;
1602 			lut_count += worker.lut_count;
1603 			packed_count += worker.packed_count;
1604 			gate_area += worker.gate_area;
1605 			lut_area += worker.lut_area;
1606 		}
1607 
1608 		log("\n");
1609 		log("Packed %d cells (%d of them duplicated) into %d LUTs.\n", packed_count, packed_count - gate_count, lut_count);
1610 		log("Solution takes %.1f%% of original gate area.\n", lut_area * 100.0 / gate_area);
1611 	}
1612 } FlowmapPass;
1613 
1614 PRIVATE_NAMESPACE_END
1615