1 /*
2  * Copyright 2019 WebAssembly Community Group participants
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "wasm-debug.h"
18 #include "wasm.h"
19 
20 #ifdef BUILD_LLVM_DWARF
21 #include "llvm/ObjectYAML/DWARFEmitter.h"
22 #include "llvm/ObjectYAML/DWARFYAML.h"
23 #include "llvm/include/llvm/DebugInfo/DWARFContext.h"
24 
25 std::error_code dwarf2yaml(llvm::DWARFContext& DCtx, llvm::DWARFYAML::Data& Y);
26 #endif
27 
28 #include "wasm-binary.h"
29 #include "wasm-debug.h"
30 #include "wasm.h"
31 
32 namespace wasm {
33 
34 namespace Debug {
35 
isDWARFSection(Name name)36 bool isDWARFSection(Name name) { return name.startsWith(".debug_"); }
37 
hasDWARFSections(const Module & wasm)38 bool hasDWARFSections(const Module& wasm) {
39   for (auto& section : wasm.userSections) {
40     if (isDWARFSection(section.name)) {
41       return true;
42     }
43   }
44   return false;
45 }
46 
47 #ifdef BUILD_LLVM_DWARF
48 
49 // In wasm32 the address size is 32 bits.
50 static const size_t AddressSize = 4;
51 
52 struct BinaryenDWARFInfo {
53   llvm::StringMap<std::unique_ptr<llvm::MemoryBuffer>> sections;
54   std::unique_ptr<llvm::DWARFContext> context;
55 
BinaryenDWARFInfowasm::Debug::BinaryenDWARFInfo56   BinaryenDWARFInfo(const Module& wasm) {
57     // Get debug sections from the wasm.
58     for (auto& section : wasm.userSections) {
59       if (Name(section.name).startsWith(".debug_") && section.data.data()) {
60         // TODO: efficiency
61         sections[section.name.substr(1)] = llvm::MemoryBuffer::getMemBufferCopy(
62           llvm::StringRef(section.data.data(), section.data.size()));
63       }
64     }
65     // Parse debug sections.
66     uint8_t addrSize = AddressSize;
67     bool isLittleEndian = true;
68     context = llvm::DWARFContext::create(sections, addrSize, isLittleEndian);
69   }
70 };
71 
dumpDWARF(const Module & wasm)72 void dumpDWARF(const Module& wasm) {
73   BinaryenDWARFInfo info(wasm);
74   std::cout << "DWARF debug info\n";
75   std::cout << "================\n\n";
76   for (auto& section : wasm.userSections) {
77     if (Name(section.name).startsWith(".debug_")) {
78       std::cout << "Contains section " << section.name << " ("
79                 << section.data.size() << " bytes)\n";
80     }
81   }
82   llvm::DIDumpOptions options;
83   options.DumpType = llvm::DIDT_All;
84   options.ShowChildren = true;
85   options.Verbose = true;
86   info.context->dump(llvm::outs(), options);
87 }
88 
89 //
90 // Big picture: We use a DWARFContext to read data, then DWARFYAML support
91 // code to write it. That is not the main LLVM Dwarf code used for writing
92 // object files, but it avoids us create a "fake" MC layer, and provides a
93 // simple way to write out the debug info. Likely the level of info represented
94 // in the DWARFYAML::Data object is sufficient for Binaryen's needs, but if not,
95 // we may need a different approach.
96 //
97 // In more detail:
98 //
99 // 1. Binary sections => DWARFContext:
100 //
101 //     llvm::DWARFContext::create(sections..)
102 //
103 // 2. DWARFContext => DWARFYAML::Data
104 //
105 //     std::error_code dwarf2yaml(DWARFContext &DCtx, DWARFYAML::Data &Y) {
106 //
107 // 3. DWARFYAML::Data => binary sections
108 //
109 //     StringMap<std::unique_ptr<MemoryBuffer>>
110 //     EmitDebugSections(llvm::DWARFYAML::Data &DI, bool ApplyFixups);
111 //
112 
113 // Represents the state when parsing a line table.
114 struct LineState {
115   uint32_t addr = 0;
116   // TODO sectionIndex?
117   uint32_t line = 1;
118   uint32_t col = 0;
119   uint32_t file = 1;
120   uint32_t isa = 0;
121   uint32_t discriminator = 0;
122   bool isStmt;
123   bool basicBlock = false;
124   bool prologueEnd = false;
125   bool epilogueBegin = false;
126   // Each instruction is part of a sequence, all of which get the same ID. The
127   // order within a sequence may change if binaryen reorders things, which means
128   // that we can't track the end_sequence location and assume it is at the end -
129   // we must track sequences and then emit an end for each one.
130   // -1 is an invalid marker value (note that this assumes we can fit all ids
131   // into just under 32 bits).
132   uint32_t sequenceId = -1;
133 
134   LineState(const LineState& other) = default;
LineStatewasm::Debug::LineState135   LineState(const llvm::DWARFYAML::LineTable& table, uint32_t sequenceId)
136     : isStmt(table.DefaultIsStmt), sequenceId(sequenceId) {}
137 
138   LineState& operator=(const LineState& other) = default;
139 
140   // Updates the state, and returns whether a new row is ready to be emitted.
updatewasm::Debug::LineState141   bool update(llvm::DWARFYAML::LineTableOpcode& opcode,
142               const llvm::DWARFYAML::LineTable& table) {
143     switch (opcode.Opcode) {
144       case 0: {
145         // Extended opcodes
146         switch (opcode.SubOpcode) {
147           case llvm::dwarf::DW_LNE_set_address: {
148             addr = opcode.Data;
149             break;
150           }
151           case llvm::dwarf::DW_LNE_end_sequence: {
152             return true;
153           }
154           case llvm::dwarf::DW_LNE_set_discriminator: {
155             discriminator = opcode.Data;
156             break;
157           }
158           case llvm::dwarf::DW_LNE_define_file: {
159             Fatal() << "TODO: DW_LNE_define_file";
160           }
161           default: {
162             // An unknown opcode, ignore.
163             std::cerr << "warning: unknown subopcopde " << opcode.SubOpcode
164                       << '\n';
165           }
166         }
167         break;
168       }
169       case llvm::dwarf::DW_LNS_set_column: {
170         col = opcode.Data;
171         break;
172       }
173       case llvm::dwarf::DW_LNS_set_prologue_end: {
174         prologueEnd = true;
175         break;
176       }
177       case llvm::dwarf::DW_LNS_copy: {
178         return true;
179       }
180       case llvm::dwarf::DW_LNS_advance_pc: {
181         assert(table.MinInstLength == 1);
182         addr += opcode.Data;
183         break;
184       }
185       case llvm::dwarf::DW_LNS_advance_line: {
186         line += opcode.SData;
187         break;
188       }
189       case llvm::dwarf::DW_LNS_set_file: {
190         file = opcode.Data;
191         break;
192       }
193       case llvm::dwarf::DW_LNS_negate_stmt: {
194         isStmt = !isStmt;
195         break;
196       }
197       case llvm::dwarf::DW_LNS_set_basic_block: {
198         basicBlock = true;
199         break;
200       }
201       case llvm::dwarf::DW_LNS_const_add_pc: {
202         uint8_t AdjustOpcode = 255 - table.OpcodeBase;
203         uint64_t AddrOffset =
204           (AdjustOpcode / table.LineRange) * table.MinInstLength;
205         addr += AddrOffset;
206         break;
207       }
208       case llvm::dwarf::DW_LNS_fixed_advance_pc: {
209         addr += opcode.Data;
210         break;
211       }
212       case llvm::dwarf::DW_LNS_set_isa: {
213         isa = opcode.Data;
214         break;
215       }
216       default: {
217         if (opcode.Opcode >= table.OpcodeBase) {
218           // Special opcode: adjust line and addr, using some math.
219           uint8_t AdjustOpcode =
220             opcode.Opcode - table.OpcodeBase; // 20 - 13 = 7
221           uint64_t AddrOffset = (AdjustOpcode / table.LineRange) *
222                                 table.MinInstLength; // (7 / 14) * 1 = 0
223           int32_t LineOffset =
224             table.LineBase +
225             (AdjustOpcode % table.LineRange); // -5 + (7 % 14) = 2
226           line += LineOffset;
227           addr += AddrOffset;
228           return true;
229         } else {
230           Fatal() << "unknown debug line opcode: " << std::hex << opcode.Opcode;
231         }
232       }
233     }
234     return false;
235   }
236 
237   // Checks if this starts a new range of addresses. Each range is a set of
238   // related addresses, where in particular, if the first has been zeroed out
239   // by the linker, we must omit the entire range. (If we do not, then the
240   // initial range is 0 and the others are offsets relative to it, which will
241   // look like random addresses, perhaps into the middle of instructions, and
242   // perhaps that happen to collide with real ones.)
startsNewRangewasm::Debug::LineState243   bool startsNewRange(llvm::DWARFYAML::LineTableOpcode& opcode) {
244     return opcode.Opcode == 0 &&
245            opcode.SubOpcode == llvm::dwarf::DW_LNE_set_address;
246   }
247 
needToEmitwasm::Debug::LineState248   bool needToEmit() {
249     // Zero values imply we can ignore this line.
250     // https://github.com/WebAssembly/debugging/issues/9#issuecomment-567720872
251     return line != 0 && addr != 0;
252   }
253 
254   // Given an old state, emit the diff from it to this state into a new line
255   // table entry (that will be emitted in the updated DWARF debug line section).
emitDiffwasm::Debug::LineState256   void emitDiff(const LineState& old,
257                 std::vector<llvm::DWARFYAML::LineTableOpcode>& newOpcodes,
258                 const llvm::DWARFYAML::LineTable& table,
259                 bool endSequence) const {
260     bool useSpecial = false;
261     if (addr != old.addr || line != old.line) {
262       // Try to use a special opcode TODO
263     }
264     if (addr != old.addr && !useSpecial) {
265       // len = 1 (subopcode) + 4 (wasm32 address)
266       // FIXME: look at AddrSize on the Unit.
267       auto item = makeItem(llvm::dwarf::DW_LNE_set_address, 5);
268       item.Data = addr;
269       newOpcodes.push_back(item);
270     }
271     if (line != old.line && !useSpecial) {
272       auto item = makeItem(llvm::dwarf::DW_LNS_advance_line);
273       // In wasm32 we have 32-bit addresses, and the delta here might be
274       // negative (note that SData is 64-bit, as LLVM supports 64-bit
275       // addresses too).
276       item.SData = int32_t(line - old.line);
277       newOpcodes.push_back(item);
278     }
279     if (col != old.col) {
280       auto item = makeItem(llvm::dwarf::DW_LNS_set_column);
281       item.Data = col;
282       newOpcodes.push_back(item);
283     }
284     if (file != old.file) {
285       auto item = makeItem(llvm::dwarf::DW_LNS_set_file);
286       item.Data = file;
287       newOpcodes.push_back(item);
288     }
289     if (isa != old.isa) {
290       auto item = makeItem(llvm::dwarf::DW_LNS_set_isa);
291       item.Data = isa;
292       newOpcodes.push_back(item);
293     }
294     if (discriminator != old.discriminator) {
295       // len = 1 (subopcode) + 4 (wasm32 address)
296       auto item = makeItem(llvm::dwarf::DW_LNE_set_discriminator, 5);
297       item.Data = discriminator;
298       newOpcodes.push_back(item);
299     }
300     if (isStmt != old.isStmt) {
301       newOpcodes.push_back(makeItem(llvm::dwarf::DW_LNS_negate_stmt));
302     }
303     if (basicBlock != old.basicBlock) {
304       assert(basicBlock);
305       newOpcodes.push_back(makeItem(llvm::dwarf::DW_LNS_set_basic_block));
306     }
307     if (prologueEnd) {
308       newOpcodes.push_back(makeItem(llvm::dwarf::DW_LNS_set_prologue_end));
309     }
310     if (epilogueBegin != old.epilogueBegin) {
311       Fatal() << "eb";
312     }
313     if (useSpecial) {
314       // Emit a special, which emits a line automatically.
315       // TODO
316     } else {
317       // Emit the line manually.
318       if (endSequence) {
319         // len = 1 (subopcode)
320         newOpcodes.push_back(makeItem(llvm::dwarf::DW_LNE_end_sequence, 1));
321       } else {
322         newOpcodes.push_back(makeItem(llvm::dwarf::DW_LNS_copy));
323       }
324     }
325   }
326 
327   // Some flags are automatically reset after each debug line.
resetAfterLinewasm::Debug::LineState328   void resetAfterLine() { prologueEnd = false; }
329 
330 private:
331   llvm::DWARFYAML::LineTableOpcode
makeItemwasm::Debug::LineState332   makeItem(llvm::dwarf::LineNumberOps opcode) const {
333     llvm::DWARFYAML::LineTableOpcode item = {};
334     item.Opcode = opcode;
335     return item;
336   }
337 
338   llvm::DWARFYAML::LineTableOpcode
makeItemwasm::Debug::LineState339   makeItem(llvm::dwarf::LineNumberExtendedOps opcode, uint64_t len) const {
340     auto item = makeItem(llvm::dwarf::LineNumberOps(0));
341     // All the length after the len field itself, including the subopcode
342     // (1 byte).
343     item.ExtLen = len;
344     item.SubOpcode = opcode;
345     return item;
346   }
347 };
348 
349 // Represents a mapping of addresses to expressions. We track beginnings and
350 // endings of expressions separately, since the end of one (which is one past
351 // the end in DWARF notation) overlaps with the beginning of the next, and also
352 // to let us use contextual information (we may know we are looking up the end
353 // of an instruction).
354 struct AddrExprMap {
355   std::unordered_map<BinaryLocation, Expression*> startMap;
356   std::unordered_map<BinaryLocation, Expression*> endMap;
357 
358   // Some instructions have delimiter binary locations, like the else and end in
359   // and if. Track those separately, including their expression and their id
360   // ("else", "end", etc.), as they are rare, and we don't want to
361   // bloat the common case which is represented in the earlier maps.
362   struct DelimiterInfo {
363     Expression* expr;
364     BinaryLocations::DelimiterId id;
365   };
366   std::unordered_map<BinaryLocation, DelimiterInfo> delimiterMap;
367 
368   // Construct the map from the binaryLocations loaded from the wasm.
AddrExprMapwasm::Debug::AddrExprMap369   AddrExprMap(const Module& wasm) {
370     for (auto& func : wasm.functions) {
371       for (auto pair : func->expressionLocations) {
372         add(pair.first, pair.second);
373       }
374       for (auto pair : func->delimiterLocations) {
375         add(pair.first, pair.second);
376       }
377     }
378   }
379 
getStartwasm::Debug::AddrExprMap380   Expression* getStart(BinaryLocation addr) const {
381     auto iter = startMap.find(addr);
382     if (iter != startMap.end()) {
383       return iter->second;
384     }
385     return nullptr;
386   }
387 
getEndwasm::Debug::AddrExprMap388   Expression* getEnd(BinaryLocation addr) const {
389     auto iter = endMap.find(addr);
390     if (iter != endMap.end()) {
391       return iter->second;
392     }
393     return nullptr;
394   }
395 
getDelimiterwasm::Debug::AddrExprMap396   DelimiterInfo getDelimiter(BinaryLocation addr) const {
397     auto iter = delimiterMap.find(addr);
398     if (iter != delimiterMap.end()) {
399       return iter->second;
400     }
401     return DelimiterInfo{nullptr, BinaryLocations::Invalid};
402   }
403 
404 private:
addwasm::Debug::AddrExprMap405   void add(Expression* expr, const BinaryLocations::Span span) {
406     assert(startMap.count(span.start) == 0);
407     startMap[span.start] = expr;
408     assert(endMap.count(span.end) == 0);
409     endMap[span.end] = expr;
410   }
411 
addwasm::Debug::AddrExprMap412   void add(Expression* expr,
413            const BinaryLocations::DelimiterLocations& delimiter) {
414     for (Index i = 0; i < delimiter.size(); i++) {
415       if (delimiter[i] != 0) {
416         assert(delimiterMap.count(delimiter[i]) == 0);
417         delimiterMap[delimiter[i]] =
418           DelimiterInfo{expr, BinaryLocations::DelimiterId(i)};
419       }
420     }
421   }
422 };
423 
424 // Represents a mapping of addresses to expressions. As with expressions, we
425 // track both start and end; here, however, "start" means the "start" and
426 // "declarations" fields in FunctionLocations, and "end" means the two locations
427 // of one past the end, and one before it which is the "end" opcode that is
428 // emitted.
429 struct FuncAddrMap {
430   std::unordered_map<BinaryLocation, Function*> startMap, endMap;
431 
432   // Construct the map from the binaryLocations loaded from the wasm.
FuncAddrMapwasm::Debug::FuncAddrMap433   FuncAddrMap(const Module& wasm) {
434     for (auto& func : wasm.functions) {
435       startMap[func->funcLocation.start] = func.get();
436       startMap[func->funcLocation.declarations] = func.get();
437       endMap[func->funcLocation.end - 1] = func.get();
438       endMap[func->funcLocation.end] = func.get();
439     }
440   }
441 
getStartwasm::Debug::FuncAddrMap442   Function* getStart(BinaryLocation addr) const {
443     auto iter = startMap.find(addr);
444     if (iter != startMap.end()) {
445       return iter->second;
446     }
447     return nullptr;
448   }
449 
getEndwasm::Debug::FuncAddrMap450   Function* getEnd(BinaryLocation addr) const {
451     auto iter = endMap.find(addr);
452     if (iter != endMap.end()) {
453       return iter->second;
454     }
455     return nullptr;
456   }
457 };
458 
459 // Track locations from the original binary and the new one we wrote, so that
460 // we can update debug positions.
461 // We track expressions and functions separately, instead of having a single
462 // big map of (oldAddr) => (newAddr) because of the potentially ambiguous case
463 // of the final expression in a function: it's end might be identical in offset
464 // to the end of the function. So we have two different things that map to the
465 // same offset. However, if the context is "the end of the function" then the
466 // updated address is the new end of the function, even if the function ends
467 // with a different instruction now, as the old last instruction might have
468 // moved or been optimized out.
469 struct LocationUpdater {
470   Module& wasm;
471   const BinaryLocations& newLocations;
472 
473   AddrExprMap oldExprAddrMap;
474   FuncAddrMap oldFuncAddrMap;
475 
476   // Map offsets of location list entries in the debug_loc section to the index
477   // of their compile unit.
478   std::unordered_map<BinaryLocation, size_t> locToUnitMap;
479 
480   // Map start of line tables in the debug_line section to their new locations.
481   std::unordered_map<BinaryLocation, BinaryLocation> debugLineMap;
482 
483   typedef std::pair<BinaryLocation, BinaryLocation> OldToNew;
484 
485   // Map of compile unit index => old and new base offsets (i.e., in the
486   // original binary and in the new one).
487   std::unordered_map<size_t, OldToNew> compileUnitBases;
488 
489   // TODO: for memory efficiency, we may want to do this in a streaming manner,
490   //       binary to binary, without YAML IR.
491 
LocationUpdaterwasm::Debug::LocationUpdater492   LocationUpdater(Module& wasm, const BinaryLocations& newLocations)
493     : wasm(wasm), newLocations(newLocations), oldExprAddrMap(wasm),
494       oldFuncAddrMap(wasm) {}
495 
496   // Updates an expression's address. If there was never an instruction at that
497   // address, or if there was but if that instruction no longer exists, return
498   // 0. Otherwise, return the new updated location.
getNewExprStartwasm::Debug::LocationUpdater499   BinaryLocation getNewExprStart(BinaryLocation oldAddr) const {
500     if (auto* expr = oldExprAddrMap.getStart(oldAddr)) {
501       auto iter = newLocations.expressions.find(expr);
502       if (iter != newLocations.expressions.end()) {
503         BinaryLocation newAddr = iter->second.start;
504         return newAddr;
505       }
506     }
507     return 0;
508   }
509 
hasOldExprStartwasm::Debug::LocationUpdater510   bool hasOldExprStart(BinaryLocation oldAddr) const {
511     return oldExprAddrMap.getStart(oldAddr);
512   }
513 
getNewExprEndwasm::Debug::LocationUpdater514   BinaryLocation getNewExprEnd(BinaryLocation oldAddr) const {
515     if (auto* expr = oldExprAddrMap.getEnd(oldAddr)) {
516       auto iter = newLocations.expressions.find(expr);
517       if (iter != newLocations.expressions.end()) {
518         return iter->second.end;
519       }
520     }
521     return 0;
522   }
523 
hasOldExprEndwasm::Debug::LocationUpdater524   bool hasOldExprEnd(BinaryLocation oldAddr) const {
525     return oldExprAddrMap.getEnd(oldAddr);
526   }
527 
getNewFuncStartwasm::Debug::LocationUpdater528   BinaryLocation getNewFuncStart(BinaryLocation oldAddr) const {
529     if (auto* func = oldFuncAddrMap.getStart(oldAddr)) {
530       // The function might have been optimized away, check.
531       auto iter = newLocations.functions.find(func);
532       if (iter != newLocations.functions.end()) {
533         auto oldLocations = func->funcLocation;
534         auto newLocations = iter->second;
535         if (oldAddr == oldLocations.start) {
536           return newLocations.start;
537         } else if (oldAddr == oldLocations.declarations) {
538           return newLocations.declarations;
539         } else {
540           WASM_UNREACHABLE("invalid func start");
541         }
542       }
543     }
544     return 0;
545   }
546 
hasOldFuncStartwasm::Debug::LocationUpdater547   bool hasOldFuncStart(BinaryLocation oldAddr) const {
548     return oldFuncAddrMap.getStart(oldAddr);
549   }
550 
getNewFuncEndwasm::Debug::LocationUpdater551   BinaryLocation getNewFuncEnd(BinaryLocation oldAddr) const {
552     if (auto* func = oldFuncAddrMap.getEnd(oldAddr)) {
553       // The function might have been optimized away, check.
554       auto iter = newLocations.functions.find(func);
555       if (iter != newLocations.functions.end()) {
556         auto oldLocations = func->funcLocation;
557         auto newLocations = iter->second;
558         if (oldAddr == oldLocations.end) {
559           return newLocations.end;
560         } else if (oldAddr == oldLocations.end - 1) {
561           return newLocations.end - 1;
562         } else {
563           WASM_UNREACHABLE("invalid func end");
564         }
565       }
566     }
567     return 0;
568   }
569 
570   // Check for either the end opcode, or one past the end.
hasOldFuncEndwasm::Debug::LocationUpdater571   bool hasOldFuncEnd(BinaryLocation oldAddr) const {
572     return oldFuncAddrMap.getEnd(oldAddr);
573   }
574 
575   // Check specifically for the end opcode.
hasOldFuncEndOpcodewasm::Debug::LocationUpdater576   bool hasOldFuncEndOpcode(BinaryLocation oldAddr) const {
577     if (auto* func = oldFuncAddrMap.getEnd(oldAddr)) {
578       return oldAddr == func->funcLocation.end - 1;
579     }
580     return false;
581   }
582 
getNewDelimiterwasm::Debug::LocationUpdater583   BinaryLocation getNewDelimiter(BinaryLocation oldAddr) const {
584     auto info = oldExprAddrMap.getDelimiter(oldAddr);
585     if (info.expr) {
586       auto iter = newLocations.delimiters.find(info.expr);
587       if (iter != newLocations.delimiters.end()) {
588         return iter->second[info.id];
589       }
590     }
591     return 0;
592   }
593 
hasOldDelimiterwasm::Debug::LocationUpdater594   bool hasOldDelimiter(BinaryLocation oldAddr) const {
595     return oldExprAddrMap.getDelimiter(oldAddr).expr;
596   }
597 
598   // getNewStart|EndAddr utilities.
599   // TODO: should we track the start and end of delimiters, even though they
600   //       are just one byte?
getNewStartwasm::Debug::LocationUpdater601   BinaryLocation getNewStart(BinaryLocation oldStart) const {
602     if (hasOldExprStart(oldStart)) {
603       return getNewExprStart(oldStart);
604     } else if (hasOldFuncStart(oldStart)) {
605       return getNewFuncStart(oldStart);
606     } else if (hasOldDelimiter(oldStart)) {
607       return getNewDelimiter(oldStart);
608     }
609     return 0;
610   }
611 
getNewEndwasm::Debug::LocationUpdater612   BinaryLocation getNewEnd(BinaryLocation oldEnd) const {
613     if (hasOldExprEnd(oldEnd)) {
614       return getNewExprEnd(oldEnd);
615     } else if (hasOldFuncEnd(oldEnd)) {
616       return getNewFuncEnd(oldEnd);
617     } else if (hasOldDelimiter(oldEnd)) {
618       return getNewDelimiter(oldEnd);
619     }
620     return 0;
621   }
622 
getNewDebugLineLocationwasm::Debug::LocationUpdater623   BinaryLocation getNewDebugLineLocation(BinaryLocation old) const {
624     return debugLineMap.at(old);
625   }
626 
627   // Given an offset in .debug_loc, get the old and new compile unit bases.
getCompileUnitBasesForLocwasm::Debug::LocationUpdater628   OldToNew getCompileUnitBasesForLoc(size_t offset) const {
629     auto index = locToUnitMap.at(offset);
630     auto iter = compileUnitBases.find(index);
631     if (iter != compileUnitBases.end()) {
632       return iter->second;
633     }
634     return OldToNew{0, 0};
635   }
636 };
637 
638 // Update debug lines, and update the locationUpdater with debug line offset
639 // changes so we can update offsets into the debug line section.
updateDebugLines(llvm::DWARFYAML::Data & data,LocationUpdater & locationUpdater)640 static void updateDebugLines(llvm::DWARFYAML::Data& data,
641                              LocationUpdater& locationUpdater) {
642   for (auto& table : data.DebugLines) {
643     uint32_t sequenceId = 0;
644     // Parse the original opcodes and emit new ones.
645     LineState state(table, sequenceId);
646     // All the addresses we need to write out.
647     std::vector<BinaryLocation> newAddrs;
648     std::unordered_map<BinaryLocation, LineState> newAddrInfo;
649     // If the address was zeroed out, we must omit the entire range (we could
650     // also leave it unchanged, so that the debugger ignores it based on the
651     // initial zero; but it's easier and better to just not emit it at all).
652     bool omittingRange = false;
653     for (auto& opcode : table.Opcodes) {
654       // Update the state, and check if we have a new row to emit.
655       if (state.startsNewRange(opcode)) {
656         omittingRange = false;
657       }
658       if (state.update(opcode, table)) {
659         if (state.addr == 0) {
660           omittingRange = true;
661         }
662         if (omittingRange) {
663           state = LineState(table, sequenceId);
664           continue;
665         }
666         // An expression may not exist for this line table item, if we optimized
667         // it away.
668         BinaryLocation oldAddr = state.addr;
669         BinaryLocation newAddr = 0;
670         if (locationUpdater.hasOldExprStart(oldAddr)) {
671           newAddr = locationUpdater.getNewExprStart(oldAddr);
672         }
673         // Test for a function's end address first, as LLVM output appears to
674         // use 1-past-the-end-of-the-function as a location in that function,
675         // and not the next (but the first byte of the next function, which is
676         // ambiguously identical to that value, is used at least in low_pc).
677         else if (locationUpdater.hasOldFuncEnd(oldAddr)) {
678           newAddr = locationUpdater.getNewFuncEnd(oldAddr);
679         } else if (locationUpdater.hasOldFuncStart(oldAddr)) {
680           newAddr = locationUpdater.getNewFuncStart(oldAddr);
681         } else if (locationUpdater.hasOldDelimiter(oldAddr)) {
682           newAddr = locationUpdater.getNewDelimiter(oldAddr);
683         }
684         if (newAddr && state.needToEmit()) {
685           // LLVM sometimes emits the same address more than once. We should
686           // probably investigate that.
687           if (newAddrInfo.count(newAddr)) {
688             continue;
689           }
690           newAddrs.push_back(newAddr);
691           newAddrInfo.emplace(newAddr, state);
692           auto& updatedState = newAddrInfo.at(newAddr);
693           // The only difference is the address TODO other stuff?
694           updatedState.addr = newAddr;
695           // Reset relevant state.
696           state.resetAfterLine();
697         }
698         if (opcode.Opcode == 0 &&
699             opcode.SubOpcode == llvm::dwarf::DW_LNE_end_sequence) {
700           sequenceId++;
701           // We assume the number of sequences can fit in 32 bits, and -1 is
702           // an invalid value.
703           assert(sequenceId != uint32_t(-1));
704           state = LineState(table, sequenceId);
705         }
706       }
707     }
708     // Sort the new addresses (which may be substantially different from the
709     // original layout after optimization).
710     std::sort(newAddrs.begin(), newAddrs.end());
711     // Emit a new line table.
712     {
713       std::vector<llvm::DWARFYAML::LineTableOpcode> newOpcodes;
714       for (size_t i = 0; i < newAddrs.size(); i++) {
715         LineState state = newAddrInfo.at(newAddrs[i]);
716         assert(state.needToEmit());
717         LineState lastState(table, -1);
718         if (i != 0) {
719           lastState = newAddrInfo.at(newAddrs[i - 1]);
720           // If the last line is in another sequence, clear the old state, as
721           // there is nothing to diff to.
722           if (lastState.sequenceId != state.sequenceId) {
723             lastState = LineState(table, -1);
724           }
725         }
726         // This line ends a sequence if there is no next line after it, or if
727         // the next line is in a different sequence.
728         bool endSequence =
729           i + 1 == newAddrs.size() ||
730           newAddrInfo.at(newAddrs[i + 1]).sequenceId != state.sequenceId;
731         state.emitDiff(lastState, newOpcodes, table, endSequence);
732       }
733       table.Opcodes.swap(newOpcodes);
734     }
735   }
736   // After updating the contents, run the emitter in order to update the
737   // lengths of each section. We will use that to update offsets into the
738   // debug_line section.
739   std::vector<size_t> computedLengths;
740   llvm::DWARFYAML::ComputeDebugLine(data, computedLengths);
741   BinaryLocation oldLocation = 0, newLocation = 0;
742   for (size_t i = 0; i < data.DebugLines.size(); i++) {
743     auto& table = data.DebugLines[i];
744     locationUpdater.debugLineMap[oldLocation] = newLocation;
745     oldLocation += table.Length.getLength() + AddressSize;
746     newLocation += computedLengths[i] + AddressSize;
747     table.Length.setLength(computedLengths[i]);
748   }
749 }
750 
751 // Iterate in parallel over a DwarfContext representation element and a
752 // YAML element, which parallel each other.
753 template<typename T, typename U, typename W>
iterContextAndYAML(const T & contextList,U & yamlList,W func)754 static void iterContextAndYAML(const T& contextList, U& yamlList, W func) {
755   auto yamlValue = yamlList.begin();
756   for (const auto& contextValue : contextList) {
757     assert(yamlValue != yamlList.end());
758     func(contextValue, *yamlValue);
759     yamlValue++;
760   }
761   assert(yamlValue == yamlList.end());
762 }
763 
764 // Updates a YAML entry from a DWARF DIE. Also updates LocationUpdater
765 // associating each .debug_loc entry with the base address of its corresponding
766 // compilation unit.
updateDIE(const llvm::DWARFDebugInfoEntry & DIE,llvm::DWARFYAML::Entry & yamlEntry,const llvm::DWARFAbbreviationDeclaration * abbrevDecl,LocationUpdater & locationUpdater,size_t compileUnitIndex)767 static void updateDIE(const llvm::DWARFDebugInfoEntry& DIE,
768                       llvm::DWARFYAML::Entry& yamlEntry,
769                       const llvm::DWARFAbbreviationDeclaration* abbrevDecl,
770                       LocationUpdater& locationUpdater,
771                       size_t compileUnitIndex) {
772   auto tag = DIE.getTag();
773   // Pairs of low/high_pc require some special handling, as the high
774   // may be an offset relative to the low. First, process everything but
775   // the high pcs, so we see the low pcs first.
776   BinaryLocation oldLowPC = 0, newLowPC = 0;
777   iterContextAndYAML(
778     abbrevDecl->attributes(),
779     yamlEntry.Values,
780     [&](const llvm::DWARFAbbreviationDeclaration::AttributeSpec& attrSpec,
781         llvm::DWARFYAML::FormValue& yamlValue) {
782       auto attr = attrSpec.Attr;
783       if (attr == llvm::dwarf::DW_AT_low_pc) {
784         // This is an address.
785         BinaryLocation oldValue = yamlValue.Value, newValue = 0;
786         if (tag == llvm::dwarf::DW_TAG_GNU_call_site ||
787             tag == llvm::dwarf::DW_TAG_inlined_subroutine ||
788             tag == llvm::dwarf::DW_TAG_lexical_block ||
789             tag == llvm::dwarf::DW_TAG_label) {
790           newValue = locationUpdater.getNewStart(oldValue);
791         } else if (tag == llvm::dwarf::DW_TAG_compile_unit) {
792           newValue = locationUpdater.getNewFuncStart(oldValue);
793           // Per the DWARF spec, "The base address of a compile unit is
794           // defined as the value of the DW_AT_low_pc attribute, if present."
795           locationUpdater.compileUnitBases[compileUnitIndex] =
796             LocationUpdater::OldToNew{oldValue, newValue};
797         } else if (tag == llvm::dwarf::DW_TAG_subprogram) {
798           newValue = locationUpdater.getNewFuncStart(oldValue);
799         } else {
800           Fatal() << "unknown tag with low_pc "
801                   << llvm::dwarf::TagString(tag).str();
802         }
803         oldLowPC = oldValue;
804         newLowPC = newValue;
805         yamlValue.Value = newValue;
806       } else if (attr == llvm::dwarf::DW_AT_stmt_list) {
807         // This is an offset into the debug line section.
808         yamlValue.Value =
809           locationUpdater.getNewDebugLineLocation(yamlValue.Value);
810       } else if (attr == llvm::dwarf::DW_AT_location &&
811                  attrSpec.Form == llvm::dwarf::DW_FORM_sec_offset) {
812         BinaryLocation locOffset = yamlValue.Value;
813         locationUpdater.locToUnitMap[locOffset] = compileUnitIndex;
814       }
815     });
816   // Next, process the high_pcs.
817   // TODO: do this more efficiently, without a second traversal (but that's a
818   //       little tricky given the special double-traversal we have).
819   iterContextAndYAML(
820     abbrevDecl->attributes(),
821     yamlEntry.Values,
822     [&](const llvm::DWARFAbbreviationDeclaration::AttributeSpec& attrSpec,
823         llvm::DWARFYAML::FormValue& yamlValue) {
824       auto attr = attrSpec.Attr;
825       if (attr != llvm::dwarf::DW_AT_high_pc) {
826         return;
827       }
828       BinaryLocation oldValue = yamlValue.Value, newValue = 0;
829       bool isRelative = attrSpec.Form == llvm::dwarf::DW_FORM_data4;
830       if (isRelative) {
831         oldValue += oldLowPC;
832       }
833       if (tag == llvm::dwarf::DW_TAG_GNU_call_site ||
834           tag == llvm::dwarf::DW_TAG_inlined_subroutine ||
835           tag == llvm::dwarf::DW_TAG_lexical_block ||
836           tag == llvm::dwarf::DW_TAG_label) {
837         newValue = locationUpdater.getNewExprEnd(oldValue);
838       } else if (tag == llvm::dwarf::DW_TAG_compile_unit ||
839                  tag == llvm::dwarf::DW_TAG_subprogram) {
840         newValue = locationUpdater.getNewFuncEnd(oldValue);
841       } else {
842         Fatal() << "unknown tag with low_pc "
843                 << llvm::dwarf::TagString(tag).str();
844       }
845       if (isRelative) {
846         newValue -= newLowPC;
847       }
848       yamlValue.Value = newValue;
849     });
850 }
851 
updateCompileUnits(const BinaryenDWARFInfo & info,llvm::DWARFYAML::Data & yaml,LocationUpdater & locationUpdater)852 static void updateCompileUnits(const BinaryenDWARFInfo& info,
853                                llvm::DWARFYAML::Data& yaml,
854                                LocationUpdater& locationUpdater) {
855   // The context has the high-level information we need, and the YAML is where
856   // we write changes. First, iterate over the compile units.
857   size_t compileUnitIndex = 0;
858   iterContextAndYAML(
859     info.context->compile_units(),
860     yaml.CompileUnits,
861     [&](const std::unique_ptr<llvm::DWARFUnit>& CU,
862         llvm::DWARFYAML::Unit& yamlUnit) {
863       // Process the DIEs in each compile unit.
864       iterContextAndYAML(
865         CU->dies(),
866         yamlUnit.Entries,
867         [&](const llvm::DWARFDebugInfoEntry& DIE,
868             llvm::DWARFYAML::Entry& yamlEntry) {
869           // Process the entries in each relevant DIE, looking for attributes to
870           // change.
871           auto abbrevDecl = DIE.getAbbreviationDeclarationPtr();
872           if (abbrevDecl) {
873             // This is relevant; look for things to update.
874             updateDIE(
875               DIE, yamlEntry, abbrevDecl, locationUpdater, compileUnitIndex);
876           }
877         });
878       compileUnitIndex++;
879     });
880 }
881 
updateRanges(llvm::DWARFYAML::Data & yaml,const LocationUpdater & locationUpdater)882 static void updateRanges(llvm::DWARFYAML::Data& yaml,
883                          const LocationUpdater& locationUpdater) {
884   // In each range section, try to update the start and end. If we no longer
885   // have something to map them to, we must skip that part.
886   size_t skip = 0;
887   for (size_t i = 0; i < yaml.Ranges.size(); i++) {
888     auto& range = yaml.Ranges[i];
889     BinaryLocation oldStart = range.Start, oldEnd = range.End, newStart = 0,
890                    newEnd = 0;
891     // If this is an end marker (0, 0), or an invalid range (0, x) or (x, 0)
892     // then just emit it as it is - either to mark the end, or to mark an
893     // invalid entry.
894     if (oldStart == 0 || oldEnd == 0) {
895       newStart = oldStart;
896       newEnd = oldEnd;
897     } else {
898       // This was a valid entry; update it.
899       newStart = locationUpdater.getNewStart(oldStart);
900       newEnd = locationUpdater.getNewEnd(oldEnd);
901       if (newStart == 0 || newEnd == 0) {
902         // This part of the range no longer has a mapping, so we must skip it.
903         // Don't use (0, 0) as that would be an end marker; emit something
904         // invalid for the debugger to ignore.
905         newStart = 0;
906         newEnd = 1;
907       }
908       // TODO even if range start and end markers have been preserved,
909       // instructions in the middle may have moved around, making the range no
910       // longer contiguous. We should check that, and possibly split/merge
911       // the range. Or, we may need to have tracking in the IR for this.
912     }
913     auto& writtenRange = yaml.Ranges[i - skip];
914     writtenRange.Start = newStart;
915     writtenRange.End = newEnd;
916   }
917 }
918 
919 // A location that is ignoreable, i.e., not a special value like 0 or -1 (which
920 // would indicate an end or a base in .debug_loc).
921 static const BinaryLocation IGNOREABLE_LOCATION = 1;
922 
isNewBaseLoc(const llvm::DWARFYAML::Loc & loc)923 static bool isNewBaseLoc(const llvm::DWARFYAML::Loc& loc) {
924   return loc.Start == BinaryLocation(-1);
925 }
926 
isEndMarkerLoc(const llvm::DWARFYAML::Loc & loc)927 static bool isEndMarkerLoc(const llvm::DWARFYAML::Loc& loc) {
928   return loc.Start == 0 && loc.End == 0;
929 }
930 
931 // Update the .debug_loc section.
updateLoc(llvm::DWARFYAML::Data & yaml,const LocationUpdater & locationUpdater)932 static void updateLoc(llvm::DWARFYAML::Data& yaml,
933                       const LocationUpdater& locationUpdater) {
934   // Similar to ranges, try to update the start and end. Note that here we
935   // can't skip since the location description is a variable number of bytes,
936   // so we mark no longer valid addresses as empty.
937   bool atStart = true;
938   // We need to keep positions in the .debug_loc section identical to before
939   // (or else we'd need to update their positions too) and so we need to keep
940   // base entries around (a base entry is added to every entry after it in the
941   // list). However, we may change the base's value as after moving instructions
942   // around the old base may not be smaller than all the values relative to it.
943   BinaryLocation oldBase, newBase;
944   auto& locs = yaml.Locs;
945   for (size_t i = 0; i < locs.size(); i++) {
946     auto& loc = locs[i];
947     if (atStart) {
948       std::tie(oldBase, newBase) =
949         locationUpdater.getCompileUnitBasesForLoc(loc.CompileUnitOffset);
950       atStart = false;
951     }
952     // By default we copy values over, unless we modify them below.
953     BinaryLocation newStart = loc.Start, newEnd = loc.End;
954     if (isNewBaseLoc(loc)) {
955       // This is a new base.
956       // Note that the base is not the address of an instruction, necessarily -
957       // it's just a number (seems like it could always be an instruction, but
958       // that's not what LLVM emits).
959       // We must look forward at everything relative to this base, so that we
960       // can emit a new proper base (as mentioned earlier, the original base may
961       // not be valid if instructions moved to a position before it - they must
962       // be positive offsets from it).
963       oldBase = newBase = newEnd;
964       BinaryLocation smallest = -1;
965       for (size_t j = i + 1; j < locs.size(); j++) {
966         auto& futureLoc = locs[j];
967         if (isNewBaseLoc(futureLoc) || isEndMarkerLoc(futureLoc)) {
968           break;
969         }
970         auto updatedStart =
971           locationUpdater.getNewStart(futureLoc.Start + oldBase);
972         // If we found a valid mapping, this is a relevant value for us. If the
973         // optimizer removed it, it's a 0, and we can ignore it here - we will
974         // emit IGNOREABLE_LOCATION for it later anyhow.
975         if (updatedStart != 0) {
976           smallest = std::min(smallest, updatedStart);
977         }
978       }
979       // If we found no valid values that will be relativized here, just use 0
980       // as the new (never-to-be-used) base, which is less confusing (otherwise
981       // the value looks like it means something).
982       if (smallest == BinaryLocation(-1)) {
983         smallest = 0;
984       }
985       newBase = newEnd = smallest;
986     } else if (isEndMarkerLoc(loc)) {
987       // This is an end marker, this list is done; reset the base.
988       atStart = true;
989     } else {
990       // This is a normal entry, try to find what it should be updated to. First
991       // de-relativize it to the base to get the absolute address, then look for
992       // a new address for it.
993       newStart = locationUpdater.getNewStart(loc.Start + oldBase);
994       newEnd = locationUpdater.getNewEnd(loc.End + oldBase);
995       if (newStart == 0 || newEnd == 0 || newStart > newEnd) {
996         // This part of the loc no longer has a mapping, or after the mapping
997         // it is no longer a proper span, so we must ignore it.
998         newStart = newEnd = IGNOREABLE_LOCATION;
999       } else {
1000         // We picked a new base that ensures it is smaller than the values we
1001         // will relativize to it.
1002         assert(newStart >= newBase && newEnd >= newBase);
1003         newStart -= newBase;
1004         newEnd -= newBase;
1005         if (newStart == 0 && newEnd == 0) {
1006           // After mapping to the new positions, and after relativizing to the
1007           // base, if we end up with (0, 0) then we must emit something else, as
1008           // that would be interpreted as the end of a list. As it is an empty
1009           // span, the actual value doesn't matter, it just has to be != 0.
1010           // This can happen if the very first span in a compile unit is an
1011           // empty span, in which case relative to the base of the compile unit
1012           // we would have (0, 0).
1013           newStart = newEnd = IGNOREABLE_LOCATION;
1014         }
1015       }
1016       // The loc start and end markers have been preserved. However, TODO
1017       // instructions in the middle may have moved around, making the loc no
1018       // longer contiguous, we should check that, and possibly split/merge
1019       // the loc. Or, we may need to have tracking in the IR for this.
1020     }
1021     loc.Start = newStart;
1022     loc.End = newEnd;
1023     // Note how the ".Location" field is unchanged.
1024   }
1025 }
1026 
writeDWARFSections(Module & wasm,const BinaryLocations & newLocations)1027 void writeDWARFSections(Module& wasm, const BinaryLocations& newLocations) {
1028   BinaryenDWARFInfo info(wasm);
1029 
1030   // Convert to Data representation, which YAML can use to write.
1031   llvm::DWARFYAML::Data data;
1032   if (dwarf2yaml(*info.context, data)) {
1033     Fatal() << "Failed to parse DWARF to YAML";
1034   }
1035 
1036   LocationUpdater locationUpdater(wasm, newLocations);
1037 
1038   updateDebugLines(data, locationUpdater);
1039 
1040   updateCompileUnits(info, data, locationUpdater);
1041 
1042   updateRanges(data, locationUpdater);
1043 
1044   updateLoc(data, locationUpdater);
1045 
1046   // Convert to binary sections.
1047   auto newSections =
1048     EmitDebugSections(data, false /* EmitFixups for debug_info */);
1049 
1050   // Update the custom sections in the wasm.
1051   // TODO: efficiency
1052   for (auto& section : wasm.userSections) {
1053     if (Name(section.name).startsWith(".debug_")) {
1054       auto llvmName = section.name.substr(1);
1055       if (newSections.count(llvmName)) {
1056         auto llvmData = newSections[llvmName]->getBuffer();
1057         section.data.resize(llvmData.size());
1058         std::copy(llvmData.begin(), llvmData.end(), section.data.data());
1059       }
1060     }
1061   }
1062 }
1063 
1064 #else // BUILD_LLVM_DWARF
1065 
dumpDWARF(const Module & wasm)1066 void dumpDWARF(const Module& wasm) {
1067   std::cerr << "warning: no DWARF dumping support present\n";
1068 }
1069 
writeDWARFSections(Module & wasm,const BinaryLocations & newLocations)1070 void writeDWARFSections(Module& wasm, const BinaryLocations& newLocations) {
1071   std::cerr << "warning: no DWARF updating support present\n";
1072 }
1073 
1074 #endif // BUILD_LLVM_DWARF
1075 
1076 } // namespace Debug
1077 
1078 } // namespace wasm
1079