1#!/usr/bin/env python3
2#
3#===- git-clang-format - ClangFormat Git Integration ---------*- python -*--===#
4#
5# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
6# See https://llvm.org/LICENSE.txt for license information.
7# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8#
9#===------------------------------------------------------------------------===#
10
11r"""
12clang-format git integration
13============================
14
15This file provides a clang-format integration for git. Put it somewhere in your
16path and ensure that it is executable. Then, "git clang-format" will invoke
17clang-format on the changes in current files or a specific commit.
18
19For further details, run:
20git clang-format -h
21
22Requires Python 2.7 or Python 3
23"""
24
25from __future__ import absolute_import, division, print_function
26import argparse
27import collections
28import contextlib
29import errno
30import os
31import re
32import subprocess
33import sys
34
# Usage synopsis passed to argparse as the `usage` text.
usage = ('git clang-format [OPTIONS] [<commit>] [<commit>|--staged] '
         '[--] [<file>...]')

# Long description passed to argparse as `description`.  main() selects
# RawDescriptionHelpFormatter, so the layout below is kept as written.
desc = '''
If zero or one commits are given, run clang-format on all lines that differ
between the working directory and <commit>, which defaults to HEAD.  Changes are
only applied to the working directory, or in the stage/index.

Examples:
  To format staged changes, i.e everything that's been `git add`ed:
    git clang-format

  To also format everything touched in the most recent commit:
    git clang-format HEAD~1

  If you're on a branch off main, to format everything touched on your branch:
    git clang-format main

If two commits are given (requires --diff), run clang-format on all lines in the
second <commit> that differ from the first <commit>.

The following git-config settings set the default of the corresponding option:
  clangFormat.binary
  clangFormat.commit
  clangFormat.extensions
  clangFormat.style
'''

# Name of the temporary index file in which to save the output of
# clang-format.  This file is created within the .git directory.
temp_index_basename = 'clang-format-index'


# A run of consecutive lines: `start` is the first line number and `count` is
# the number of lines in the run.
Range = collections.namedtuple('Range', 'start, count')
69
70
def main():
  """Parse the command line, run clang-format on the changed lines, and
  either apply the result or print it as a diff/diffstat.

  Returns the process exit status: 0 if there was nothing to format or
  clang-format changed nothing, the return code of `git diff` for --diff and
  --diffstat, and 1 once changes have been applied.  Invalid usage is
  reported through die()."""
  config = load_git_config()

  # In order to keep '--' yet allow options after positionals, we need to
  # check for '--' ourselves.  (Setting nargs='*' throws away the '--', while
  # nargs=argparse.REMAINDER disallows options after positionals.)
  argv = sys.argv[1:]
  try:
    idx = argv.index('--')
  except ValueError:
    dash_dash = []
  else:
    dash_dash = argv[idx:]
    argv = argv[:idx]

  default_extensions = ','.join([
      # From clang/lib/Frontend/FrontendOptions.cpp, all lower case
      'c', 'h',  # C
      'm',  # ObjC
      'mm',  # ObjC++
      'cc', 'cp', 'cpp', 'c++', 'cxx', 'hh', 'hpp', 'hxx', 'inc',  # C++
      'ccm', 'cppm', 'cxxm', 'c++m',  # C++ Modules
      'cu', 'cuh',  # CUDA
      # Other languages that clang-format supports
      'proto', 'protodevel',  # Protocol Buffers
      'java',  # Java
      'js',  # JavaScript
      'ts',  # TypeScript
      'cs',  # C Sharp
      'json',  # Json
      ])

  p = argparse.ArgumentParser(
    usage=usage, formatter_class=argparse.RawDescriptionHelpFormatter,
    description=desc)
  p.add_argument('--binary',
                 default=config.get('clangformat.binary', 'clang-format'),
                 help='path to clang-format')
  p.add_argument('--commit',
                 default=config.get('clangformat.commit', 'HEAD'),
                 help='default commit to use if none is specified')
  p.add_argument('--diff', action='store_true',
                 help='print a diff instead of applying the changes')
  p.add_argument('--diffstat', action='store_true',
                 help='print a diffstat instead of applying the changes')
  p.add_argument('--extensions',
                 default=config.get('clangformat.extensions',
                                    default_extensions),
                 help=('comma-separated list of file extensions to format, '
                       'excluding the period and case-insensitive'))
  p.add_argument('-f', '--force', action='store_true',
                 help='allow changes to unstaged files')
  p.add_argument('-p', '--patch', action='store_true',
                 help='select hunks interactively')
  p.add_argument('-q', '--quiet', action='count', default=0,
                 help='print less information')
  p.add_argument('--staged', '--cached', action='store_true',
                 help='format lines in the stage instead of the working dir')
  p.add_argument('--style',
                 default=config.get('clangformat.style', None),
                 help='passed to clang-format')
  p.add_argument('-v', '--verbose', action='count', default=0,
                 help='print extra information')
  # We gather all the remaining positional arguments into 'args' since we need
  # to use some heuristics to determine whether or not <commit> was present.
  # However, to print pretty messages, we make use of metavar and help.
  p.add_argument('args', nargs='*', metavar='<commit>',
                 help='revision from which to compute the diff')
  p.add_argument('ignored', nargs='*', metavar='<file>...',
                 help='if specified, only consider differences in these files')
  opts = p.parse_args(argv)

  # Fold -q into the verbosity level: each -q cancels one -v and the level
  # may go negative, which silences the default messages below.
  opts.verbose -= opts.quiet
  del opts.quiet

  commits, files = interpret_args(opts.args, dash_dash, opts.commit)
  # Reject too many commits first; nesting this check under the "one commit
  # or fewer" branch would make it unreachable.
  if len(commits) > 2:
    die('at most two commits allowed; %d given' % len(commits))
  if len(commits) == 2:
    if opts.staged:
      die('--staged is not allowed when two commits are given')
    if not opts.diff:
      die('--diff is required when two commits are given')
  changed_lines = compute_diff_and_extract_lines(commits, files, opts.staged)
  if opts.verbose >= 1:
    # Remember every file in the diff so the ones filtered out below can be
    # reported.
    ignored_files = set(changed_lines)
  filter_by_extension(changed_lines, opts.extensions.lower().split(','))
  # The computed diff outputs absolute paths, so we must cd before accessing
  # those files.
  cd_to_toplevel()
  filter_symlinks(changed_lines)
  if opts.verbose >= 1:
    ignored_files.difference_update(changed_lines)
    if ignored_files:
      print(
        'Ignoring changes in the following files (wrong extension or symlink):')
      for filename in ignored_files:
        print('    %s' % filename)
    if changed_lines:
      print('Running clang-format on the following files:')
      for filename in changed_lines:
        print('    %s' % filename)

  if not changed_lines:
    if opts.verbose >= 0:
      print('no modified files to format')
    return 0

  # `revision` tells the formatter where to read file contents from: a commit
  # name, '' for the index, or None for the working directory.
  if len(commits) > 1:
    old_tree = commits[1]
    revision = old_tree
  elif opts.staged:
    old_tree = create_tree_from_index(changed_lines)
    revision = ''
  else:
    old_tree = create_tree_from_workdir(changed_lines)
    revision = None
  new_tree = run_clang_format_and_save_to_tree(changed_lines,
                                               revision,
                                               binary=opts.binary,
                                               style=opts.style)
  if opts.verbose >= 1:
    print('old tree: %s' % old_tree)
    print('new tree: %s' % new_tree)

  if old_tree == new_tree:
    if opts.verbose >= 0:
      print('clang-format did not modify any files')
    return 0

  if opts.diff:
    return print_diff(old_tree, new_tree)
  if opts.diffstat:
    return print_diffstat(old_tree, new_tree)

  changed_files = apply_changes(old_tree, new_tree, force=opts.force,
                                patch_mode=opts.patch)
  if (opts.verbose >= 0 and not opts.patch) or opts.verbose >= 1:
    print('changed files:')
    for filename in changed_files:
      print('    %s' % filename)

  return 1
215
216
def load_git_config(non_string_options=None):
  """Read the output of `git config --list --null` into a dictionary.

  Values are kept as the strings git printed, except for names present in
  `non_string_options`: a dictionary mapping an option name (in lower case)
  to either "--bool" or "--int".  For those names the value is re-read with
  `git config <flag> <name>`."""
  typed_options = {} if non_string_options is None else non_string_options
  settings = {}
  listing = run('git', 'config', '--list', '--null')
  for record in listing.split('\0'):
    if not record:
      continue
    # With --null the name and the value are separated by a newline.
    key, separator, val = record.partition('\n')
    if not separator:
      # A setting with no '=' ('\n' with --null) is implicitly 'true'
      val = 'true'
    if key in typed_options:
      val = run('git', 'config', typed_options[key], key)
    settings[key] = val
  return settings
238
239
def interpret_args(args, dash_dash, default_commit):
  """Split `args` of the form "[commits] [--] [files]" into (commits, files).

  The caller must already have moved "--" and everything after it out of
  `args` and into `dash_dash`.

  When `dash_dash` is non-empty, everything in `args` is taken as a commit
  and must name a commit or tag.  Otherwise `args` is consumed from the left
  for as long as the leading argument is a revision, and whatever remains is
  the file list.  When no commit is found, `[default_commit]` is used."""
  if dash_dash:
    commits = [default_commit] if len(args) == 0 else args
    for commit in commits:
      kind = get_object_type(commit)
      if kind in ('commit', 'tag'):
        continue
      if kind is None:
        die("'%s' is not a commit" % commit)
      else:
        die("'%s' is a %s, but a commit was expected" % (commit, kind))
    # dash_dash[0] is the '--' itself.
    return commits, dash_dash[1:]

  if not args:
    return [default_commit], []

  commits = []
  # Leading revisions are popped off `args` in place; the rest are files.
  while args and disambiguate_revision(args[0]):
    commits.append(args.pop(0))
  if not commits:
    commits = [default_commit]
  return commits, args
276
277
def disambiguate_revision(value):
  """Tell whether `value` is a revision (True) or a file (False).

  Dies if `value` names a git object that is neither a commit nor a tag."""
  # `git rev-parse` fails with an appropriate error message when `value` is
  # ambiguous (neither a commit nor a file); run() then exits the process.
  run('git', 'rev-parse', value, verbose=False)
  kind = get_object_type(value)
  if kind in ('commit', 'tag'):
    return True
  if kind is None:
    return False
  die('`%s` is a %s, but a commit or filename was expected' %
      (value, kind))
290
291
def get_object_type(value):
  """Return the object type reported by `git cat-file -t` for `value`, or
  None when that command fails (`value` is not a valid git object)."""
  cat_file = subprocess.Popen(['git', 'cat-file', '-t', value],
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
  # stderr is captured only so that git's complaint is not shown to the user.
  output, _ = cat_file.communicate()
  if cat_file.returncode == 0:
    return convert_string(output.strip())
  return None
301
302
def compute_diff_and_extract_lines(commits, files, staged):
  """Run compute_diff() and feed its output through extract_lines().

  Returns the dictionary produced by extract_lines().  Exits with status 2
  if the diff process fails."""
  proc = compute_diff(commits, files, staged)
  lines_by_file = extract_lines(proc.stdout)
  proc.stdout.close()
  if proc.wait() != 0:
    # Assume error was already printed to stderr.
    sys.exit(2)
  return lines_by_file
313
314
def compute_diff(commits, files, staged):
  """Start the git process that produces the diff for `commits`.

  The returned process's `stdout` file object yields a patch with the
  differences between the working directory (or the stage if `staged`) and
  the first commit when a single commit is given, or between the two commits
  when two are given, restricted to `files` if non-empty.
  The patch is generated with zero context lines."""
  if len(commits) > 1:
    tool, flags = 'diff-tree', []
  elif staged:
    tool, flags = 'diff-index', ['--cached']
  else:
    tool, flags = 'diff-index', []
  argv = ['git', tool, '-p', '-U0'] + flags + commits + ['--']
  argv.extend(files)
  diff = subprocess.Popen(argv, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
  # The diff reads nothing from us; close its stdin right away.
  diff.stdin.close()
  return diff
334
335
def extract_lines(patch_file):
  """Collect the changed line ranges described by `patch_file`.

  `patch_file` is an iterable of patch lines in unidiff format generated with
  ``-U0`` (zero lines of context).  The return value is a dict mapping each
  filename to a list of line `Range`s.

  A hunk whose new-side start line is 0 is skipped, and a hunk whose new-side
  count is 0 is recorded as a single line."""
  file_header = re.compile(r'^\+\+\+\ [^/]+/(.*)')
  hunk_header = re.compile(r'^@@ -[0-9,]+ \+(\d+)(,(\d+))?')
  ranges_by_file = {}
  for raw in patch_file:
    text = convert_string(raw)
    header = file_header.search(text)
    if header:
      filename = header.group(1).rstrip('\r\n')
    hunk = hunk_header.search(text)
    if not hunk:
      continue
    start = int(hunk.group(1))
    if start == 0:
      continue
    # A missing count means one line.
    count = int(hunk.group(3)) if hunk.group(3) else 1
    if count == 0:
      count = 1
    ranges_by_file.setdefault(filename, []).append(Range(start, count))
  return ranges_by_file
363
364
def filter_by_extension(dictionary, allowed_extensions):
  """Delete every key in `dictionary` that doesn't have an allowed extension.

  `allowed_extensions` must be a collection of lowercase file extensions,
  excluding the period.  The comparison is case-insensitive.

  The extension is taken from the last path component only, so a dot in a
  directory name is never mistaken for the start of an extension.  A file
  name without a dot has the empty extension '', which is kept only when ''
  is itself listed in `allowed_extensions`.  `dictionary` is modified in
  place; nothing is returned."""
  allowed_extensions = frozenset(allowed_extensions)
  # Iterate over a snapshot of the keys because entries are deleted below.
  for filename in list(dictionary.keys()):
    # Assumes '/'-separated paths, as in the git diff headers these keys are
    # parsed from.
    basename = filename.rsplit('/', 1)[-1]
    _, dot, extension = basename.rpartition('.')
    if not dot:
      # No dot at all: rpartition() put the whole name in `extension`.
      extension = ''
    if extension.lower() not in allowed_extensions:
      del dictionary[filename]
377
378
def filter_symlinks(dictionary):
  """Remove, in place, every entry of `dictionary` whose key is the path of
  a symbolic link."""
  symlinks = [path for path in dictionary if os.path.islink(path)]
  for path in symlinks:
    del dictionary[path]
384
385
def cd_to_toplevel():
  """Make the directory printed by `git rev-parse --show-toplevel` the
  current working directory."""
  os.chdir(run('git', 'rev-parse', '--show-toplevel'))
390
391
def create_tree_from_workdir(filenames):
  """Build a git tree from the working-directory copies of `filenames`.

  Returns the object ID (SHA-1) of the created tree."""
  # '--stdin' mode: create_tree() is handed plain file names.
  tree_id = create_tree(filenames, '--stdin')
  return tree_id
397
398
def create_tree_from_index(filenames):
  """Build a git tree from the index entries of `filenames`.

  Returns whatever create_tree() returns for the '--index-info' lines
  gathered with `git ls-files --stage`."""
  # Copy the environment, because the files have to be read from the original
  # index.  The generator below is only consumed later, inside create_tree().
  original_env = os.environ.copy()

  def staged_entries():
    for path in filenames:
      ls_files = subprocess.Popen(
          ['git', 'ls-files', '--stage', '-z', '--', path],
          env=original_env,
          stdin=subprocess.PIPE,
          stdout=subprocess.PIPE)
      output, _ = ls_files.communicate()
      # Only the first NUL-terminated record of the output is used.
      yield convert_string(output.split(b'\0')[0])

  return create_tree(staged_entries(), '--index-info')
412
413
def run_clang_format_and_save_to_tree(changed_lines, revision=None,
                                      binary='clang-format', style=None):
  """Format every file in `changed_lines` and store the results in a git tree.

  `changed_lines` maps each filename to its line ranges.  `revision` selects
  where file modes and contents come from: a non-empty revision name, '' for
  the index, or None for the working directory.

  Returns the object ID (SHA-1) of the created tree."""
  # Copy the environment when formatting the files in the index, because the
  # files have to be read from the original index.
  if revision == '':
    env = os.environ.copy()
  else:
    env = None

  def recorded_mode(filename):
    """Return the file mode git has recorded for `filename`, as an int."""
    if len(revision) > 0:
      metadata_cmd = ['git', 'ls-tree',
                      '%s:%s' % (revision, os.path.dirname(filename)),
                      os.path.basename(filename)]
    else:
      metadata_cmd = ['git', 'ls-files', '--stage', '--', filename]
    metadata = subprocess.Popen(metadata_cmd, env=env,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE)
    output = metadata.communicate()[0]
    # The mode is the first whitespace-separated field, in octal.
    return int(output.split()[0], 8)

  def git_mode_string(numeric_mode):
    """Render a mode as octal text with a plain '0' prefix."""
    text = oct(numeric_mode)
    # Adjust python3 octal format so that it matches what git expects
    if text.startswith('0o'):
      text = '0' + text[2:]
    return text

  def index_info_generator():
    try:
      entries = changed_lines.iteritems()  # Python 2
    except AttributeError:
      entries = changed_lines.items()  # Python 3
    for filename, line_ranges in entries:
      if revision is None:
        numeric_mode = os.stat(filename).st_mode
      else:
        numeric_mode = recorded_mode(filename)
      mode = git_mode_string(numeric_mode)
      blob_id = clang_format_to_blob(filename, line_ranges,
                                     revision=revision,
                                     binary=binary,
                                     style=style,
                                     env=env)
      yield '%s %s\t%s' % (mode, blob_id, filename)

  return create_tree(index_info_generator(), '--index-info')
453
454
def create_tree(input_lines, mode):
  """Write a tree object built from `input_lines` and return its ID.

  With mode '--stdin', `input_lines` must be an iterable of filenames.  With
  mode '--index-info' it must be an iterable of values suitable for "git
  update-index --index-info", such as "<mode> <SP> <sha1> <TAB> <filename>".
  Any other mode is invalid."""
  assert mode in ('--stdin', '--index-info')
  update_index_cmd = ['git', 'update-index', '--add', '-z', mode]
  with temporary_index_file():
    updater = subprocess.Popen(update_index_cmd, stdin=subprocess.PIPE)
    for entry in input_lines:
      # -z: every record is terminated by a NUL byte.
      updater.stdin.write(to_bytes('%s\0' % entry))
    updater.stdin.close()
    status = updater.wait()
    if status != 0:
      die('`%s` failed' % ' '.join(update_index_cmd))
    return run('git', 'write-tree')
473
474
def clang_format_to_blob(filename, line_ranges, revision=None,
                         binary='clang-format', style=None, env=None):
  """Run clang-format on the given file and save the result to a git blob.

  Runs on the file in `revision` if not None, or on the file in the working
  directory if `revision` is None. Revision can be set to an empty string to run
  clang-format on the file in the index.

  `line_ranges` is an iterable of (start_line, line_count) pairs; each one is
  passed to clang-format as a `-lines=` option.  `binary` is the clang-format
  executable, `style` (if truthy) is passed as `-style=`, and `env` is the
  environment used for the `git cat-file` process.

  Dies if the clang-format executable cannot be found or if any process in
  the pipeline exits with a non-zero status.

  Returns the object ID (SHA-1) of the created blob."""
  clang_format_cmd = [binary]
  if style:
    clang_format_cmd.extend(['-style='+style])
  # One -lines=<first>:<last> option per range; <last> is inclusive, hence
  # the -1.
  clang_format_cmd.extend([
      '-lines=%s:%s' % (start_line, start_line+line_count-1)
      for start_line, line_count in line_ranges])
  if revision is not None:
    # Feed the contents of <revision>:<filename> to clang-format on stdin and
    # tell it the file name through -assume-filename.
    clang_format_cmd.extend(['-assume-filename='+filename])
    git_show_cmd = ['git', 'cat-file', 'blob', '%s:%s' % (revision, filename)]
    git_show = subprocess.Popen(git_show_cmd, env=env, stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE)
    git_show.stdin.close()
    clang_format_stdin = git_show.stdout
  else:
    # clang-format is given the file name directly; its stdin is a pipe that
    # is closed below without anything being written to it.
    clang_format_cmd.extend([filename])
    git_show = None
    clang_format_stdin = subprocess.PIPE
  try:
    clang_format = subprocess.Popen(clang_format_cmd, stdin=clang_format_stdin,
                                    stdout=subprocess.PIPE)
    if clang_format_stdin == subprocess.PIPE:
      # Replace the PIPE marker with the real file object so it can be closed.
      clang_format_stdin = clang_format.stdin
  except OSError as e:
    if e.errno == errno.ENOENT:
      die('cannot find executable "%s"' % binary)
    else:
      raise
  # Close this process's handle on clang-format's input (either the read end
  # of git's output or the unused stdin pipe).
  clang_format_stdin.close()
  # Pipe the formatted output into `git hash-object -w`, which writes the blob.
  hash_object_cmd = ['git', 'hash-object', '-w', '--path='+filename, '--stdin']
  hash_object = subprocess.Popen(hash_object_cmd, stdin=clang_format.stdout,
                                 stdout=subprocess.PIPE)
  clang_format.stdout.close()
  stdout = hash_object.communicate()[0]
  # Check the pipeline from its last stage back to its first.
  if hash_object.returncode != 0:
    die('`%s` failed' % ' '.join(hash_object_cmd))
  if clang_format.wait() != 0:
    die('`%s` failed' % ' '.join(clang_format_cmd))
  if git_show and git_show.wait() != 0:
    die('`%s` failed' % ' '.join(git_show_cmd))
  return convert_string(stdout).rstrip('\r\n')
524
525
@contextlib.contextmanager
def temporary_index_file(tree=None):
  """Context manager that points GIT_INDEX_FILE at a temporary index file.

  The index is created by create_temporary_index(tree).  On exit the previous
  value of GIT_INDEX_FILE is restored (or the variable is removed if it was
  unset) and the temporary file is deleted."""
  temp_path = create_temporary_index(tree)
  saved = os.environ.get('GIT_INDEX_FILE')
  os.environ['GIT_INDEX_FILE'] = temp_path
  try:
    yield
  finally:
    if saved is not None:
      os.environ['GIT_INDEX_FILE'] = saved
    else:
      del os.environ['GIT_INDEX_FILE']
    os.remove(temp_path)
541
542
def create_temporary_index(tree=None):
  """Create a temporary index file and return its path.

  The index is populated from `tree` when one is given; with `tree` left as
  None an empty index is created."""
  git_dir = run('git', 'rev-parse', '--git-dir')
  index_path = os.path.join(git_dir, temp_index_basename)
  source = '--empty' if tree is None else tree
  run('git', 'read-tree', '--index-output=' + index_path, source)
  return index_path
554
555
def print_diff(old_tree, new_tree):
  """Show the diff between the two trees and return git's exit code."""
  # The porcelain 'diff' is used rather than the plumbing 'diff-tree' because
  # the output is meant for the user, and only the porcelain does nice things
  # like color and pagination.
  #
  # Only modified files are shown: `new_tree` contains just the files that
  # were modified, so without the filter every other file would show up as
  # deleted.
  diff_cmd = ['git', 'diff', '--diff-filter=M', '--exit-code',
              old_tree, new_tree]
  completed = subprocess.run(diff_cmd)
  return completed.returncode
567
def print_diffstat(old_tree, new_tree):
  """Show the diffstat between the two trees and return git's exit code."""
  # The porcelain 'diff' is used rather than the plumbing 'diff-tree' because
  # the output is meant for the user, and only the porcelain does nice things
  # like color and pagination.
  #
  # Only modified files are shown: `new_tree` contains just the files that
  # were modified, so without the filter every other file would show up as
  # deleted.
  diffstat_cmd = ['git', 'diff', '--diff-filter=M', '--exit-code', '--stat',
                  old_tree, new_tree]
  completed = subprocess.run(diffstat_cmd)
  return completed.returncode
579
def apply_changes(old_tree, new_tree, force=False, patch_mode=False):
  """Apply the changes in `new_tree` to the working directory.

  Bails (exit status 2) if there are local changes in those files and not
  `force`.  If `patch_mode`, runs `git checkout --patch` to select hunks
  interactively; otherwise the changed files are checked out from `new_tree`.

  Returns the list of file names that `git diff-tree --diff-filter=M` reports
  as modified between `old_tree` and `new_tree`."""
  name_output = run('git', 'diff-tree', '--diff-filter=M', '-r', '-z',
                    '--name-only', old_tree, new_tree)
  # Splitting empty output would yield [''], and an empty string is not a
  # usable path argument, so drop empty names.
  changed_files = [name for name in name_output.rstrip('\0').split('\0')
                   if name]
  if not changed_files:
    return changed_files
  if not force:
    # '--' keeps file names that start with '-' from being read as options.
    unstaged_files = run('git', 'diff-files', '--name-status', '--',
                         *changed_files)
    if unstaged_files:
      print('The following files would be modified but '
                'have unstaged changes:', file=sys.stderr)
      print(unstaged_files, file=sys.stderr)
      print('Please commit, stage, or stash them first.', file=sys.stderr)
      sys.exit(2)
  if patch_mode:
    # In patch mode, we could just as well create an index from the new tree
    # and checkout from that, but then the user will be presented with a
    # message saying "Discard ... from worktree".  Instead, we use the old
    # tree as the index and checkout from new_tree, which gives the slightly
    # better message, "Apply ... to index and worktree".  This is not quite
    # right, since it won't be applied to the user's index, but oh well.
    with temporary_index_file(old_tree):
      subprocess.run(['git', 'checkout', '--patch', new_tree], check=True)
  else:
    with temporary_index_file(new_tree):
      # Check out only the files that actually changed, so files that
      # clang-format left alone are not rewritten.
      run('git', 'checkout-index', '-f', '--', *changed_files)
  return changed_files
610
611
def run(*args, **kwargs):
  """Run the command `args` and return its standard output as a string.

  Keyword arguments:
    stdin:   data passed to the process's standard input (default '').
    verbose: when true (the default), print a line naming the command before
             reporting its stderr or its failure.
    strip:   when true (the default), strip trailing '\\r' and '\\n'
             characters from the returned output.

  Anything the command writes to stderr is echoed to stderr.  If the command
  exits with a non-zero status the process exits with status 2.  Raises
  TypeError for any other keyword argument."""
  stdin = kwargs.pop('stdin', '')
  verbose = kwargs.pop('verbose', True)
  strip = kwargs.pop('strip', True)
  if kwargs:
    raise TypeError("run() got an unexpected keyword argument '%s'" %
                    next(iter(kwargs)))
  proc = subprocess.Popen(args, stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  raw_out, raw_err = proc.communicate(input=stdin)

  out = convert_string(raw_out)
  err = convert_string(raw_err)

  if proc.returncode != 0:
    if verbose:
      print('`%s` returned %s' % (' '.join(args), proc.returncode),
            file=sys.stderr)
    if err:
      print(err.rstrip(), file=sys.stderr)
    sys.exit(2)

  if err:
    if verbose:
      print('`%s` printed to stderr:' % ' '.join(args), file=sys.stderr)
    print(err.rstrip(), file=sys.stderr)
  return out.rstrip('\r\n') if strip else out
638
639
def die(message):
  """Print `message` to stderr, prefixed with 'error:', and exit with
  status 2."""
  print('error:', message, file=sys.stderr)
  raise SystemExit(2)
643
644
def to_bytes(str_input):
    """Return `str_input` as bytes: unchanged if it already is bytes,
    otherwise encoded as UTF-8."""
    already_binary = isinstance(str_input, bytes)
    return str_input if already_binary else str_input.encode('utf-8')
650
651
def to_string(bytes_input):
    """Return `bytes_input` as the interpreter's native `str` type.

    A `str` is returned unchanged.  `bytes` (a distinct type only on
    Python 3) is decoded as UTF-8.  Anything else is assumed to be a Python 2
    `unicode` object and is encoded as UTF-8."""
    if isinstance(bytes_input, str):
        return bytes_input
    if isinstance(bytes_input, bytes):
        # Only reachable on Python 3; on Python 2 bytes is str, handled above.
        return bytes_input.decode('utf-8')
    return bytes_input.encode('utf-8')
656
657
def convert_string(bytes_input):
    """Return `bytes_input` as a native string.

    The input is decoded as UTF-8 and passed through to_string().  If it has
    no `decode` method or decoding fails, `str(bytes_input)` is returned
    instead."""
    try:
        decoded = bytes_input.decode('utf-8')
        return to_string(decoded)
    except (AttributeError, UnicodeError):
        # AttributeError: 'str' object has no attribute 'decode'.
        # UnicodeError: the input could not be decoded as UTF-8.
        return str(bytes_input)
665
if __name__ == '__main__':
  # Use main()'s return value as the process exit status.
  raise SystemExit(main())
668