1#!/usr/bin/env python
2# Copyright 2016 the V8 project authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Script to transform and merge sancov files into human readable json-format.
7
8The script supports three actions:
9all: Writes a json file with all instrumented lines of all executables.
10merge: Merges sancov files with coverage output into an existing json file.
11split: Split json file into separate files per covered source file.
12
13The json data is structured as follows:
14{
15  "version": 1,
16  "tests": ["executable1", "executable2", ...],
17  "files": {
18    "file1": [[<instr line 1>, <bit_mask>], [<instr line 2>, <bit_mask>], ...],
19    "file2": [...],
20    ...
21  }
22}
23
24The executables are sorted and determine the test bit mask. Their index+1 is
25the bit, e.g. executable1 = 1, executable3 = 4, etc. Hence, a line covered by
26executable1 and executable3 will have bit_mask == 5 == 0b101. The number of
27tests is restricted to 52 in version 1, to allow javascript JSON parsing of
28the bitsets encoded as numbers. JS max safe int is (1 << 53) - 1.
29
30The line-number-bit_mask pairs are sorted by line number and don't contain
31duplicates.
32
33Split json data preserves the same format, but only contains one file per
34json file.
35
36The sancov tool is expected to be in the llvm compiler-rt third-party
37directory. It's not checked out by default and must be added as a custom deps:
38'v8/third_party/llvm/projects/compiler-rt':
39    'https://chromium.googlesource.com/external/llvm.org/compiler-rt.git'
40"""
41
42# for py2/py3 compatibility
43from __future__ import print_function
44from functools import reduce
45
46import argparse
47import json
48import logging
49import os
50import re
51import subprocess
52import sys
53
54from multiprocessing import Pool, cpu_count
55
56
logging.basicConfig(level=logging.INFO)

# Files to exclude from coverage. Dropping their data early adds more speed.
# The contained cc files are already excluded from instrumentation, but inlined
# data is referenced through v8's object files.
EXCLUSIONS = [
  'buildtools',
  'src/third_party',
  'third_party',
  'test',
  'testing',
]

# Executables found in the build output for which no coverage is generated.
# Exclude them from the coverage data file.
EXE_BLACKLIST = [
  'generate-bytecode-expectations',
  'hello-world',
  'mksnapshot',
  'parser-shell',
  'process',
  'shell',
]

# V8 checkout directory.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(
    os.path.abspath(__file__))))

# The sancov tool location.
SANCOV_TOOL = os.path.join(
    BASE_DIR, 'third_party', 'llvm', 'projects', 'compiler-rt',
    'lib', 'sanitizer_common', 'scripts', 'sancov.py')

# Simple script to sanitize the PCs from objdump.
SANITIZE_PCS = os.path.join(BASE_DIR, 'tools', 'sanitizers', 'sanitize_pcs.py')

# The llvm symbolizer location.
SYMBOLIZER = os.path.join(
    BASE_DIR, 'third_party', 'llvm-build', 'Release+Asserts', 'bin',
    'llvm-symbolizer')

# Number of cpus.
CPUS = cpu_count()

# Regexp to find sancov files as output by sancov_merger.py. Also grabs the
# executable name in group 1. The dots are escaped so that the pattern only
# matches a literal '.result.sancov' suffix.
SANCOV_FILE_RE = re.compile(r'^(.*)\.result\.sancov$')
104
105
def executables(build_dir):
  """Yield the absolute paths of executable files in the build directory.

  Entries on the executable blacklist (no coverage is generated for them)
  are skipped.
  """
  for entry in os.listdir(build_dir):
    if entry in EXE_BLACKLIST:
      continue
    path = os.path.join(build_dir, entry)
    if os.path.isfile(path) and os.access(path, os.X_OK):
      yield path
114
115
def process_symbolizer_output(output, build_dir):
  """Post-process llvm symbolizer output.

  Excludes files outside the v8 checkout or given in exclusion list above
  from further processing. Drops the character index in each line.

  Returns: A mapping of file names to lists of line numbers. The file names
           have relative paths to the v8 base directory. The lists of line
           numbers don't contain duplicate lines and are sorted.
  """
  # Path prefix added by the llvm symbolizer including trailing slash.
  prefix = os.path.join(build_dir, '..', '..', '')
  prefix_len = len(prefix)

  # Map file names to sets of instrumented line numbers. The path prefix is
  # dropped - it is redundant and takes too much space - and lines outside
  # that path (e.g. generated files in the build dir and absolute paths to
  # c++ library headers) are skipped entirely.
  file_map = {}
  for line in output.strip().splitlines():
    if not line.startswith(prefix):
      continue
    # Each line has the form <file name>:<line number>:<character number>.
    # Only the file name and the line number are of interest.
    file_name, line_number, _ = line[prefix_len:].split(':')
    file_map.setdefault(file_name, set()).add(int(line_number))

  # Remove exclusion patterns from the file map. It's cheaper to do it after
  # the mapping, as there are few excluded files and we don't want to do this
  # check for numerous lines in ordinary files.
  def keep(file_name):
    return not any(file_name.startswith(e) for e in EXCLUSIONS)

  # Return in serializable form and filter.
  return {name: sorted(lines)
          for name, lines in file_map.items() if keep(name)}
156
157
def get_instrumented_lines(executable):
  """Return the instrumented lines of an executable.

  Called through multiprocessing pool.

  Returns: Post-processed llvm output as returned by process_symbolizer_output.
  """
  # The first two pipes are from llvm's tool sancov.py with 0x added to the hex
  # numbers. The results are piped into the llvm symbolizer, which outputs for
  # each PC: <file name with abs path>:<line number>:<character number>.
  # We don't call the sancov tool to get more speed.
  # The grep patterns are raw strings so that the backslashes meant for grep
  # don't trigger invalid-escape-sequence warnings on python 3; the resulting
  # shell command is unchanged.
  process = subprocess.Popen(
      "objdump -d %s | "
      r"grep '^\s\+[0-9a-f]\+:.*\scall\(q\|\)\s\+[0-9a-f]\+ "
      r"<__sanitizer_cov\(_with_check\|\|_trace_pc_guard\)\(@plt\|\)>' | "
      r"grep '^\s\+[0-9a-f]\+' -o | "
      "%s | "
      "%s --obj %s -functions=none" %
          (executable, SANITIZE_PCS, SYMBOLIZER, executable),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
      # Decode the output to text, so that the string processing in
      # process_symbolizer_output works on both python 2 and python 3
      # (communicate() returns bytes by default on python 3).
      universal_newlines=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return process_symbolizer_output(output, os.path.dirname(executable))
186
187
def merge_instrumented_line_results(exe_list, results):
  """Merge multiprocessing results for all instrumented lines.

  Args:
    exe_list: List of all executable names with absolute paths.
    results: List of results as returned by get_instrumented_lines.

  Returns: Dict to be used as json data as specified on the top of this page.
           The dictionary contains all instrumented lines of all files
           referenced by all executables.
  """
  def merge_files(x, y):
    # dict.items works on python 2 and 3; iteritems was removed in python 3.
    for file_name, lines in y.items():
      x.setdefault(file_name, set()).update(lines)
    return x
  result = reduce(merge_files, results, {})

  # Return data as file->lines mapping. The lines are saved as lists
  # with (line number, test bits (as int)). The test bits are initialized with
  # 0, meaning instrumented, but no coverage.
  # The order of the test bits is given with key 'tests'. For now, these are
  # the executable names. We use a _list_ with two items instead of a tuple to
  # ease merging by allowing mutation of the second item.
  # Comprehensions are used instead of map, which returns a lazy,
  # non-json-serializable iterator on python 3.
  return {
    'version': 1,
    'tests': sorted(os.path.basename(e) for e in exe_list),
    'files': {f: [[line, 0] for line in sorted(lines)]
              for f, lines in result.items()},
  }
216
217
def write_instrumented(options):
  """Implements the 'all' action of this tool."""
  executable_paths = list(executables(options.build_dir))
  logging.info('Reading instrumented lines from %d executables.',
               len(executable_paths))

  # Symbolize all executables in parallel.
  pool = Pool(CPUS)
  try:
    instrumented = pool.imap_unordered(
        get_instrumented_lines, executable_paths)
  finally:
    pool.close()

  # Merge multiprocessing results and prepare output data.
  coverage_data = merge_instrumented_line_results(
      executable_paths, instrumented)

  logging.info('Read data from %d executables, which covers %d files.',
               len(coverage_data['tests']), len(coverage_data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write json output.
  with open(options.json_output, 'w') as f:
    json.dump(coverage_data, f, sort_keys=True)
239
240
def get_covered_lines(args):
  """Return the covered lines of an executable.

  Called through multiprocessing pool. The args are expected to unpack to:
    cov_dir: Folder with sancov files merged by sancov_merger.py.
    executable: Absolute path to the executable that was called to produce the
                given coverage data.
    sancov_file: The merged sancov file with coverage data.

  Returns: A tuple of post-processed llvm output as returned by
           process_symbolizer_output and the executable name.
  """
  cov_dir, executable, sancov_file = args

  # Let the sancov tool print the covered PCs and pipe them through the llvm
  # symbolizer.
  process = subprocess.Popen(
      '%s print %s 2> /dev/null | '
      '%s --obj %s -functions=none' %
          (SANCOV_TOOL,
           os.path.join(cov_dir, sancov_file),
           SYMBOLIZER,
           executable),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
      # Decode the output to text, so that the string processing in
      # process_symbolizer_output works on both python 2 and python 3
      # (communicate() returns bytes by default on python 3).
      universal_newlines=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return (
      process_symbolizer_output(output, os.path.dirname(executable)),
      os.path.basename(executable),
  )
276
277
def merge_covered_line_results(data, results):
  """Merge multiprocessing results for covered lines.

  The data is mutated, the results are merged into it in place.

  Args:
    data: Existing coverage data from json file containing all instrumented
          lines.
    results: List of results as returned by get_covered_lines.
  """

  # List of executables and mapping to the test bit mask. The number of
  # tests is restricted to 52, to allow javascript JSON parsing of
  # the bitsets encoded as numbers. JS max safe int is (1 << 53) - 1.
  exe_list = data['tests']
  assert len(exe_list) <= 52, 'Max 52 different tests are supported.'
  test_bit_masks = {exe: 1 << i for i, exe in enumerate(exe_list)}

  def merge_lines(old_lines, new_lines, mask, file_name):
    """Merge the coverage data of a list of lines.

    Args:
      old_lines: Lines as list of pairs with line number and test bit mask.
                 The new lines will be merged into the list in place.
      new_lines: List of new (covered) lines (sorted).
      mask: The bit to be set for covered lines. The bit index is the test
            index of the executable that covered the line.
      file_name: Name of the covered file, used in assertion messages only.
    """
    i = 0
    # Iterate over old and new lines, both are sorted.
    for l in new_lines:
      while old_lines[i][0] < l:
        # Forward instrumented lines not present in this coverage data.
        i += 1
        assert i < len(old_lines), (
            'Covered line %d of %s not in input file.' % (l, file_name))
      assert old_lines[i][0] == l, (
          'Covered line %d of %s not in input file.' % (l, file_name))

      # Add coverage information to the line.
      old_lines[i][1] |= mask

  def merge_files(data, result):
    """Merge result into data.

    The data is mutated in place.

    Args:
      data: Merged coverage data from the previous reduce step.
      result: New result to be merged in. The type is as returned by
              get_covered_lines.
    """
    file_map, executable = result
    files = data['files']
    # dict.items works on python 2 and 3; iteritems was removed in python 3.
    for file_name, lines in file_map.items():
      merge_lines(files[file_name], lines, test_bit_masks[executable],
                  file_name)
    return data

  reduce(merge_files, results, data)
336
337
def merge(options):
  """Implements the 'merge' action of this tool."""

  # The folder with the merged sancov files must exist.
  assert (os.path.exists(options.coverage_dir) and
          os.path.isdir(options.coverage_dir))

  # Build the multiprocessing inputs: one tuple per sancov file, consisting
  # of the coverage dir, the absolute path to the executable and the sancov
  # file name.
  inputs = []
  for file_name in os.listdir(options.coverage_dir):
    match = SANCOV_FILE_RE.match(file_name)
    if not match:
      continue
    executable_path = os.path.join(options.build_dir, match.group(1))
    inputs.append((options.coverage_dir, executable_path, file_name))

  logging.info('Merging %d sancov files into %s',
               len(inputs), options.json_input)

  # Post-process covered lines in parallel.
  pool = Pool(CPUS)
  try:
    results = pool.imap_unordered(get_covered_lines, inputs)
  finally:
    pool.close()

  # Load existing json data file for merging the results.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  # Merge multiprocessing results. Mutates data.
  merge_covered_line_results(data, results)

  logging.info('Merged data from %d executables, which covers %d files.',
               len(data['tests']), len(data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write merged results to file.
  with open(options.json_output, 'w') as f:
    json.dump(data, f, sort_keys=True)
381
382
def split(options):
  """Implements the 'split' action of this tool."""
  # Load existing json data file for splitting.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  logging.info('Splitting off %d coverage files from %s',
               len(data['files']), options.json_input)

  # dict.items works on python 2 and 3; iteritems was removed in python 3.
  for file_name, coverage in data['files'].items():
    # Preserve relative directories that are part of the file name.
    file_path = os.path.join(options.output_dir, file_name + '.json')
    try:
      os.makedirs(os.path.dirname(file_path))
    except OSError:
      # Ignore existing directories.
      pass

    with open(file_path, 'w') as f:
      # Flat-copy the old dict.
      new_data = dict(data)

      # Update current file.
      new_data['files'] = {file_name: coverage}

      # Write json data.
      json.dump(new_data, f, sort_keys=True)
410
411
def main(args=None):
  """Parse options and run the selected action. Returns the exit code."""
  parser = argparse.ArgumentParser()
  # TODO(machenbach): Make this required and deprecate the default.
  parser.add_argument('--build-dir',
                      default=os.path.join(BASE_DIR, 'out', 'Release'),
                      help='Path to the build output directory.')
  parser.add_argument('--coverage-dir',
                      help='Path to the sancov output files.')
  parser.add_argument('--json-input',
                      help='Path to an existing json file with coverage data.')
  parser.add_argument('--json-output',
                      help='Path to a file to write json output to.')
  parser.add_argument('--output-dir',
                      help='Directory where to put split output files to.')
  parser.add_argument('action', choices=['all', 'merge', 'split'],
                      help='Action to perform.')

  options = parser.parse_args(args)
  options.build_dir = os.path.abspath(options.build_dir)

  # Per action: the options that must be present (validated in order) and
  # the function implementing the action.
  dispatch = {
    'all': (['json_output'], write_instrumented),
    'merge': (['coverage_dir', 'json_input', 'json_output'], merge),
    'split': (['json_input', 'output_dir'], split),
  }
  required, run_action = dispatch.get(options.action.lower(), ([], None))
  for option_name in required:
    if not getattr(options, option_name):
      print('--%s is required' % option_name.replace('_', '-'))
      return 1
  if run_action:
    run_action(options)
  return 0
456
457
if __name__ == '__main__':
  # Propagate main's return value (0 on success, 1 on missing required
  # options) as the process exit code.
  sys.exit(main())
460