1#!/usr/bin/env python3
2# Copyright 2020 The gRPC Authors
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16# Script to extract build metadata from bazel BUILD.
17# To avoid having two sources of truth for the build metadata (build
18# targets, source files, header files etc.), this script analyzes the contents
19# of bazel BUILD files and generates a YAML file (currently called
# build_autogenerated.yaml). The format and semantics of the generated YAML files
# are chosen to match the format of a "build.yaml" file, which used
# to be the source of truth for gRPC build before bazel became
# the primary build system.
24# A good basic overview of the "build.yaml" format is available here:
25# https://github.com/grpc/grpc/blob/master/templates/README.md. Note that
26# while useful as an overview, the doc does not act as formal spec
27# (formal spec does not exist in fact) and the doc can be incomplete,
28# inaccurate or slightly out of date.
29# TODO(jtattermusch): In the future we want to get rid of the legacy build.yaml
30# format entirely or simplify it to a point where it becomes self-explanatory
31# and doesn't need any detailed documentation.
32
33import collections
34import os
35import re
36import subprocess
37import sys
38from typing import Any, Dict, Iterable, List, Optional
39import xml.etree.ElementTree as ET
40
41import build_cleaner
42import yaml
43
# Type aliases for the intermediate data structures passed between the
# helpers in this script.
BuildMetadata = Dict[str, Any]  # metadata describing a single build rule/target
BuildDict = Dict[str, BuildMetadata]  # maps rule/target name -> its metadata
BuildYaml = Dict[str, Any]  # the top-level "build.yaml"-like structure
47
48
def _bazel_query_xml_tree(query: str) -> ET.Element:
    """Runs a bazel query and returns its XML output parsed as an element tree."""
    # --noimplicit_deps keeps the output limited to deps listed in BUILD files
    command = [
        'tools/bazel', 'query', '--noimplicit_deps', '--output', 'xml', query
    ]
    xml_output = subprocess.check_output(command)
    return ET.fromstring(xml_output)
54
55
56def _rule_dict_from_xml_node(rule_xml_node):
57    """Converts XML node representing a rule (obtained from "bazel query --output xml") to a dictionary that contains all the metadata we will need."""
58    result = {
59        'class': rule_xml_node.attrib.get('class'),
60        'name': rule_xml_node.attrib.get('name'),
61        'srcs': [],
62        'hdrs': [],
63        'deps': [],
64        'data': [],
65        'tags': [],
66        'args': [],
67        'generator_function': None,
68        'size': None,
69        'flaky': False,
70    }
71    for child in rule_xml_node:
72        # all the metadata we want is stored under "list" tags
73        if child.tag == 'list':
74            list_name = child.attrib['name']
75            if list_name in ['srcs', 'hdrs', 'deps', 'data', 'tags', 'args']:
76                result[list_name] += [item.attrib['value'] for item in child]
77        if child.tag == 'string':
78            string_name = child.attrib['name']
79            if string_name in ['generator_function', 'size']:
80                result[string_name] = child.attrib['value']
81        if child.tag == 'boolean':
82            bool_name = child.attrib['name']
83            if bool_name in ['flaky']:
84                result[bool_name] = child.attrib['value'] == 'true'
85    return result
86
87
88def _extract_rules_from_bazel_xml(xml_tree):
89    """Extract bazel rules from an XML tree node obtained from "bazel query --output xml" command."""
90    result = {}
91    for child in xml_tree:
92        if child.tag == 'rule':
93            rule_dict = _rule_dict_from_xml_node(child)
94            rule_clazz = rule_dict['class']
95            rule_name = rule_dict['name']
96            if rule_clazz in [
97                    'cc_library',
98                    'cc_binary',
99                    'cc_test',
100                    'cc_proto_library',
101                    'proto_library',
102                    'upb_proto_library',
103                    'upb_proto_reflection_library',
104            ]:
105                if rule_name in result:
106                    raise Exception('Rule %s already present' % rule_name)
107                result[rule_name] = rule_dict
108    return result
109
110
111def _get_bazel_label(target_name: str) -> str:
112    if ':' in target_name:
113        return '//%s' % target_name
114    else:
115        return '//:%s' % target_name
116
117
118def _extract_source_file_path(label: str) -> str:
119    """Gets relative path to source file from bazel deps listing"""
120    if label.startswith('//'):
121        label = label[len('//'):]
122    # labels in form //:src/core/lib/surface/call_test_only.h
123    if label.startswith(':'):
124        label = label[len(':'):]
125    # labels in form //test/core/util:port.cc
126    label = label.replace(':', '/')
127    return label
128
129
130def _extract_public_headers(bazel_rule: BuildMetadata) -> List[str]:
131    """Gets list of public headers from a bazel rule"""
132    result = []
133    for dep in bazel_rule['hdrs']:
134        if dep.startswith('//:include/') and dep.endswith('.h'):
135            result.append(_extract_source_file_path(dep))
136    return list(sorted(result))
137
138
139def _extract_nonpublic_headers(bazel_rule: BuildMetadata) -> List[str]:
140    """Gets list of non-public headers from a bazel rule"""
141    result = []
142    for dep in bazel_rule['hdrs']:
143        if dep.startswith('//') and not dep.startswith(
144                '//:include/') and dep.endswith('.h'):
145            result.append(_extract_source_file_path(dep))
146    return list(sorted(result))
147
148
149def _extract_sources(bazel_rule: BuildMetadata) -> List[str]:
150    """Gets list of source files from a bazel rule"""
151    result = []
152    for dep in bazel_rule['srcs']:
153        if dep.startswith('//') and (dep.endswith('.cc') or dep.endswith('.c')
154                                     or dep.endswith('.proto')):
155            result.append(_extract_source_file_path(dep))
156    return list(sorted(result))
157
158
159def _extract_deps(bazel_rule: BuildMetadata,
160                  bazel_rules: BuildDict) -> List[str]:
161    """Gets list of deps from from a bazel rule"""
162    return list(sorted(bazel_rule['deps']))
163
164
def _create_target_from_bazel_rule(target_name: str,
                                   bazel_rules: BuildDict) -> BuildMetadata:
    """Create build.yaml-like target definition from bazel metadata"""
    bazel_rule = bazel_rules[_get_bazel_label(target_name)]

    # The original (non-collapsed) metadata from bazel is kept under
    # "private" underscore-prefixed keys, while the collapsed values computed
    # earlier become the public fields of the target.
    target = {'name': target_name}
    target['_PUBLIC_HEADERS_BAZEL'] = _extract_public_headers(bazel_rule)
    target['_HEADERS_BAZEL'] = _extract_nonpublic_headers(bazel_rule)
    target['_SRC_BAZEL'] = _extract_sources(bazel_rule)
    target['_DEPS_BAZEL'] = _extract_deps(bazel_rule, bazel_rules)
    target['public_headers'] = bazel_rule['_COLLAPSED_PUBLIC_HEADERS']
    target['headers'] = bazel_rule['_COLLAPSED_HEADERS']
    target['src'] = bazel_rule['_COLLAPSED_SRCS']
    target['deps'] = bazel_rule['_COLLAPSED_DEPS']
    return target
186
187
188def _external_dep_name_from_bazel_dependency(bazel_dep: str) -> Optional[str]:
189    """Returns name of dependency if external bazel dependency is provided or None"""
190    if bazel_dep.startswith('@com_google_absl//'):
191        # special case for add dependency on one of the absl libraries (there is not just one absl library)
192        prefixlen = len('@com_google_absl//')
193        return bazel_dep[prefixlen:]
194    elif bazel_dep == '//external:upb_lib':
195        return 'upb'
196    elif bazel_dep == '//external:benchmark':
197        return 'benchmark'
198    elif bazel_dep == '//external:libssl':
199        return 'libssl'
200    else:
201        # all the other external deps such as protobuf, cares, zlib
202        # don't need to be listed explicitly, they are handled automatically
203        # by the build system (make, cmake)
204        return None
205
206
def _compute_transitive_metadata(
        rule_name: str, bazel_rules: Any,
        bazel_label_to_dep_name: Dict[str, str]) -> None:
    """Computes the final build metadata for Bazel target with rule_name.

    The dependencies that will appear on the deps list are:

    * Public build targets including binaries and tests;
    * External targets, like absl, re2.

    All other intermediate dependencies will be merged, which means their
    source file, headers, etc. will be collected into one build target. This
    step of processing will greatly reduce the complexity of the generated
    build specifications for other build systems, like CMake, Make, setuptools.

    The final build metadata are:
    * _TRANSITIVE_DEPS: all the transitive dependencies including intermediate
                        targets;
    * _COLLAPSED_DEPS:  dependencies that fit our requirement above, with
                        duplicated items removed, producing the shortest
                        possible dependency list in alphabetical order;
    * _COLLAPSED_SRCS:  the merged source files;
    * _COLLAPSED_PUBLIC_HEADERS: the merged public headers;
    * _COLLAPSED_HEADERS: the merged non-public headers;
    * _EXCLUDE_DEPS: intermediate targets to exclude when performing collapsing
      of sources and dependencies.

    For the collapsed_deps, the algorithm improved cases like:

    The result in the past:
        end2end_tests -> [grpc_test_util, grpc, gpr, address_sorting, upb]
        grpc_test_util -> [grpc, gpr, address_sorting, upb, ...]
        grpc -> [gpr, address_sorting, upb, ...]

    The result of the algorithm:
        end2end_tests -> [grpc_test_util]
        grpc_test_util -> [grpc]
        grpc -> [gpr, address_sorting, upb, ...]
    """
    bazel_rule = bazel_rules[rule_name]
    direct_deps = _extract_deps(bazel_rule, bazel_rules)
    transitive_deps = set()
    collapsed_deps = set()
    exclude_deps = set()
    collapsed_srcs = set(_extract_sources(bazel_rule))
    collapsed_public_headers = set(_extract_public_headers(bazel_rule))
    collapsed_headers = set(_extract_nonpublic_headers(bazel_rule))

    for dep in direct_deps:
        external_dep_name_maybe = _external_dep_name_from_bazel_dependency(dep)

        if dep in bazel_rules:
            # Descend recursively, but no need to do that for external deps
            if external_dep_name_maybe is None:
                if "_PROCESSING_DONE" not in bazel_rules[dep]:
                    # This item is not processed before, compute now
                    _compute_transitive_metadata(dep, bazel_rules,
                                                 bazel_label_to_dep_name)

                transitive_deps.update(bazel_rules[dep].get(
                    '_TRANSITIVE_DEPS', []))
                collapsed_deps.update(bazel_rules[dep].get(
                    '_COLLAPSED_DEPS', []))
                exclude_deps.update(bazel_rules[dep].get('_EXCLUDE_DEPS', []))

        # This dep is a public target, add it as a dependency
        if dep in bazel_label_to_dep_name:
            transitive_deps.add(bazel_label_to_dep_name[dep])
            collapsed_deps.add(bazel_label_to_dep_name[dep])
            # Add all the transitive deps of our every public dep to exclude
            # list since we want to avoid building sources that are already
            # built by our dependencies
            exclude_deps.update(bazel_rules[dep]['_TRANSITIVE_DEPS'])
            continue

        # This dep is an external target, add it as a dependency
        if external_dep_name_maybe is not None:
            transitive_deps.add(external_dep_name_maybe)
            collapsed_deps.add(external_dep_name_maybe)
            continue

    # Direct dependencies are part of transitive dependencies
    transitive_deps.update(direct_deps)

    # Calculate transitive public deps (needed for collapsing sources)
    transitive_public_deps = set(
        filter(lambda x: x in bazel_label_to_dep_name, transitive_deps))

    # Remove intermediate targets that our public dependencies already depend
    # on. This is the step that further shortens the deps list.
    collapsed_deps = set(filter(lambda x: x not in exclude_deps,
                                collapsed_deps))

    # Compute the final source files and headers for this build target whose
    # name is `rule_name` (input argument of this function).
    #
    # Imagine a public target PX has transitive deps [IA, IB, PY, IC, PZ]. PX,
    # PY and PZ are public build targets. And IA, IB, IC are intermediate
    # targets. In addition, PY depends on IC.
    #
    # Translate the condition into dependency graph:
    #   PX -> [IA, IB, PY, IC, PZ]
    #   PY -> [IC]
    #   Public targets: [PX, PY, PZ]
    #
    # The collapsed dependencies of PX: [PY, PZ].
    # The excluded dependencies of X: [PY, IC, PZ].
    # (IC is excluded as a dependency of PX. It is already included in PY, hence
    # it would be redundant to include it again.)
    #
    # Target PX should include source files and headers of [PX, IA, IB] as final
    # build metadata.
    for dep in transitive_deps:
        if dep not in exclude_deps and dep not in transitive_public_deps:
            if dep in bazel_rules:
                collapsed_srcs.update(_extract_sources(bazel_rules[dep]))
                collapsed_public_headers.update(
                    _extract_public_headers(bazel_rules[dep]))
                collapsed_headers.update(
                    _extract_nonpublic_headers(bazel_rules[dep]))
    # This item is a "visited" flag that prevents reprocessing this rule
    bazel_rule['_PROCESSING_DONE'] = True
    # Following items are described in the docstring.
    bazel_rule['_TRANSITIVE_DEPS'] = list(sorted(transitive_deps))
    bazel_rule['_COLLAPSED_DEPS'] = list(sorted(collapsed_deps))
    bazel_rule['_COLLAPSED_SRCS'] = list(sorted(collapsed_srcs))
    bazel_rule['_COLLAPSED_PUBLIC_HEADERS'] = list(
        sorted(collapsed_public_headers))
    bazel_rule['_COLLAPSED_HEADERS'] = list(sorted(collapsed_headers))
    bazel_rule['_EXCLUDE_DEPS'] = list(sorted(exclude_deps))
338
339
340# TODO(jtattermusch): deduplicate with transitive_dependencies.py (which has a slightly different logic)
341# TODO(jtattermusch): This is done to avoid introducing too many intermediate
342# libraries into the build.yaml-based builds (which might in cause issues
343# building language-specific artifacts) and also because the libraries
344# in build.yaml-based build are generally considered units of distributions
345# (= public libraries that are visible to the user and are installable),
346# while in bazel builds it is customary to define larger number of smaller
347# "sublibraries". The need for elision (and expansion)
348# of intermediate libraries can be re-evaluated in the future.
349def _populate_transitive_metadata(bazel_rules: Any,
350                                  public_dep_names: Iterable[str]) -> None:
351    """Add 'transitive_deps' field for each of the rules"""
352    # Create the map between Bazel label and public dependency name
353    bazel_label_to_dep_name = {}
354    for dep_name in public_dep_names:
355        bazel_label_to_dep_name[_get_bazel_label(dep_name)] = dep_name
356
357    # Make sure we reached all the Bazel rules
358    # TODO(lidiz) potentially we could only update a subset of rules
359    for rule_name in bazel_rules:
360        if '_PROCESSING_DONE' not in bazel_rules[rule_name]:
361            _compute_transitive_metadata(rule_name, bazel_rules,
362                                         bazel_label_to_dep_name)
363
364
def update_test_metadata_with_transitive_metadata(
        all_extra_metadata: BuildDict, bazel_rules: BuildDict) -> None:
    """Patches test build metadata with transitive metadata."""
    for lib_name, lib_dict in all_extra_metadata.items():
        # only test targets are of interest here
        is_test_target = (lib_dict.get('build') == 'test' and
                          lib_dict.get('_TYPE') == 'target')
        if not is_test_target:
            continue

        transitive_deps = bazel_rules[_get_bazel_label(
            lib_name)]['_TRANSITIVE_DEPS']

        # tests that transitively depend on benchmark get benchmark defaults
        if '//external:benchmark' in transitive_deps:
            lib_dict['benchmark'] = True
            lib_dict['defaults'] = 'benchmark'

        # gtest-based tests are necessarily C++
        if '//external:gtest' in transitive_deps:
            lib_dict['gtest'] = True
            lib_dict['language'] = 'c++'
382
383
384def _expand_upb_proto_library_rules(bazel_rules):
385    # Expand the .proto files from UPB proto library rules into the pre-generated
386    # upb.h and upb.c files.
387    GEN_UPB_ROOT = '//:src/core/ext/upb-generated/'
388    GEN_UPBDEFS_ROOT = '//:src/core/ext/upbdefs-generated/'
389    EXTERNAL_LINKS = [
390        ('@com_google_protobuf//', ':src/'),
391    ]
392    for name, bazel_rule in bazel_rules.items():
393        gen_func = bazel_rule.get('generator_function', None)
394        if gen_func in ('grpc_upb_proto_library',
395                        'grpc_upb_proto_reflection_library'):
396            # get proto dependency
397            deps = bazel_rule['deps']
398            if len(deps) != 1:
399                raise Exception(
400                    'upb rule "{0}" should have 1 proto dependency but has "{1}"'
401                    .format(name, deps))
402            proto_dep = deps[0]
403            proto_rule = bazel_rules.get(proto_dep, None)
404            if proto_rule is None:
405                raise Exception(
406                    'upb rule "{0}"\'s dependency "{1}" is not found'.format(
407                        name, proto_rule))
408            # deps is not properly fetched from bazel query for upb_proto_library target
409            # so add the upb dependency manually
410            bazel_rule['deps'] = [
411                '//external:upb_lib', '//external:upb_lib_descriptor',
412                '//external:upb_generated_code_support__only_for_generated_code_do_not_use__i_give_permission_to_break_me'
413            ]
414            # populate the upb_proto_library rule with pre-generated upb headers
415            # and sources using proto_rule
416            srcs = []
417            hdrs = []
418            for proto_src in proto_rule['srcs']:
419                for external_link in EXTERNAL_LINKS:
420                    if proto_src.startswith(external_link[0]):
421                        proto_src = proto_src[len(external_link[0]) +
422                                              len(external_link[1]):]
423                        break
424                proto_src = _extract_source_file_path(proto_src)
425                ext = '.upb' if gen_func == 'grpc_upb_proto_library' else '.upbdefs'
426                root = GEN_UPB_ROOT if gen_func == 'grpc_upb_proto_library' else GEN_UPBDEFS_ROOT
427                srcs.append(root + proto_src.replace('.proto', ext + '.c'))
428                hdrs.append(root + proto_src.replace('.proto', ext + '.h'))
429            bazel_rule['srcs'] = srcs
430            bazel_rule['hdrs'] = hdrs
431
432
433def _generate_build_metadata(build_extra_metadata: BuildDict,
434                             bazel_rules: BuildDict) -> BuildDict:
435    """Generate build metadata in build.yaml-like format bazel build metadata and build.yaml-specific "extra metadata"."""
436    lib_names = list(build_extra_metadata.keys())
437    result = {}
438
439    for lib_name in lib_names:
440        lib_dict = _create_target_from_bazel_rule(lib_name, bazel_rules)
441
442        # populate extra properties from the build.yaml-specific "extra metadata"
443        lib_dict.update(build_extra_metadata.get(lib_name, {}))
444
445        # store to results
446        result[lib_name] = lib_dict
447
448    # Rename targets marked with "_RENAME" extra metadata.
449    # This is mostly a cosmetic change to ensure that we end up with build.yaml target
450    # names we're used to from the past (and also to avoid too long target names).
451    # The rename step needs to be made after we're done with most of processing logic
452    # otherwise the already-renamed libraries will have different names than expected
453    for lib_name in lib_names:
454        to_name = build_extra_metadata.get(lib_name, {}).get('_RENAME', None)
455        if to_name:
456            # store lib under the new name and also change its 'name' property
457            if to_name in result:
458                raise Exception('Cannot rename target ' + str(lib_name) + ', ' +
459                                str(to_name) + ' already exists.')
460            lib_dict = result.pop(lib_name)
461            lib_dict['name'] = to_name
462            result[to_name] = lib_dict
463
464            # dep names need to be updated as well
465            for lib_dict_to_update in result.values():
466                lib_dict_to_update['deps'] = list([
467                    to_name if dep == lib_name else dep
468                    for dep in lib_dict_to_update['deps']
469                ])
470
471    return result
472
473
474def _convert_to_build_yaml_like(lib_dict: BuildMetadata) -> BuildYaml:
475    lib_names = [
476        lib_name for lib_name in list(lib_dict.keys())
477        if lib_dict[lib_name].get('_TYPE', 'library') == 'library'
478    ]
479    target_names = [
480        lib_name for lib_name in list(lib_dict.keys())
481        if lib_dict[lib_name].get('_TYPE', 'library') == 'target'
482    ]
483    test_names = [
484        lib_name for lib_name in list(lib_dict.keys())
485        if lib_dict[lib_name].get('_TYPE', 'library') == 'test'
486    ]
487
488    # list libraries and targets in predefined order
489    lib_list = [lib_dict[lib_name] for lib_name in lib_names]
490    target_list = [lib_dict[lib_name] for lib_name in target_names]
491    test_list = [lib_dict[lib_name] for lib_name in test_names]
492
493    # get rid of temporary private fields prefixed with "_" and some other useless fields
494    for lib in lib_list:
495        for field_to_remove in [k for k in lib.keys() if k.startswith('_')]:
496            lib.pop(field_to_remove, None)
497    for target in target_list:
498        for field_to_remove in [k for k in target.keys() if k.startswith('_')]:
499            target.pop(field_to_remove, None)
500        target.pop('public_headers',
501                   None)  # public headers make no sense for targets
502    for test in test_list:
503        for field_to_remove in [k for k in test.keys() if k.startswith('_')]:
504            test.pop(field_to_remove, None)
505        test.pop('public_headers',
506                 None)  # public headers make no sense for tests
507
508    build_yaml_like = {
509        'libs': lib_list,
510        'filegroups': [],
511        'targets': target_list,
512        'tests': test_list,
513    }
514    return build_yaml_like
515
516
517def _extract_cc_tests(bazel_rules: BuildDict) -> List[str]:
518    """Gets list of cc_test tests from bazel rules"""
519    result = []
520    for bazel_rule in bazel_rules.values():
521        if bazel_rule['class'] == 'cc_test':
522            test_name = bazel_rule['name']
523            if test_name.startswith('//'):
524                prefixlen = len('//')
525                result.append(test_name[prefixlen:])
526    return list(sorted(result))
527
528
529def _exclude_unwanted_cc_tests(tests: List[str]) -> List[str]:
530    """Filters out bazel tests that we don't want to run with other build systems or we cannot build them reasonably"""
531
532    # most qps tests are autogenerated, we are fine without them
533    tests = [test for test in tests if not test.startswith('test/cpp/qps:')]
534    # microbenchmarks aren't needed for checking correctness
535    tests = [
536        test for test in tests
537        if not test.startswith('test/cpp/microbenchmarks:')
538    ]
539    tests = [
540        test for test in tests
541        if not test.startswith('test/core/promise/benchmark:')
542    ]
543
544    # we have trouble with census dependency outside of bazel
545    tests = [
546        test for test in tests
547        if not test.startswith('test/cpp/ext/filters/census:') and
548        not test.startswith('test/core/xds:xds_channel_stack_modifier_test')
549    ]
550
551    # missing opencensus/stats/stats.h
552    tests = [
553        test for test in tests if not test.startswith(
554            'test/cpp/end2end:server_load_reporting_end2end_test')
555    ]
556    tests = [
557        test for test in tests if not test.startswith(
558            'test/cpp/server/load_reporter:lb_load_reporter_test')
559    ]
560
561    # The test uses --running_under_bazel cmdline argument
562    # To avoid the trouble needing to adjust it, we just skip the test
563    tests = [
564        test for test in tests if not test.startswith(
565            'test/cpp/naming:resolver_component_tests_runner_invoker')
566    ]
567
568    # the test requires 'client_crash_test_server' to be built
569    tests = [
570        test for test in tests
571        if not test.startswith('test/cpp/end2end:time_change_test')
572    ]
573
574    # the test requires 'client_crash_test_server' to be built
575    tests = [
576        test for test in tests
577        if not test.startswith('test/cpp/end2end:client_crash_test')
578    ]
579
580    # the test requires 'server_crash_test_client' to be built
581    tests = [
582        test for test in tests
583        if not test.startswith('test/cpp/end2end:server_crash_test')
584    ]
585
586    # test never existed under build.yaml and it fails -> skip it
587    tests = [
588        test for test in tests
589        if not test.startswith('test/core/tsi:ssl_session_cache_test')
590    ]
591
592    # the binary of this test does not get built with cmake
593    tests = [
594        test for test in tests
595        if not test.startswith('test/cpp/util:channelz_sampler_test')
596    ]
597
598    # we don't need to generate fuzzers outside of bazel
599    tests = [test for test in tests if not test.endswith('_fuzzer')]
600
601    return tests
602
603
604def _generate_build_extra_metadata_for_tests(
605        tests: List[str], bazel_rules: BuildDict) -> BuildDict:
606    """For given tests, generate the "extra metadata" that we need for our "build.yaml"-like output. The extra metadata is generated from the bazel rule metadata by using a bunch of heuristics."""
607    test_metadata = {}
608    for test in tests:
609        test_dict = {'build': 'test', '_TYPE': 'target'}
610
611        bazel_rule = bazel_rules[_get_bazel_label(test)]
612
613        bazel_tags = bazel_rule['tags']
614        if 'manual' in bazel_tags:
615            # don't run the tests marked as "manual"
616            test_dict['run'] = False
617
618        if bazel_rule['flaky']:
619            # don't run tests that are marked as "flaky" under bazel
620            # because that would only add noise for the run_tests.py tests
621            # and seeing more failures for tests that we already know are flaky
622            # doesn't really help anything
623            test_dict['run'] = False
624
625        if 'no_uses_polling' in bazel_tags:
626            test_dict['uses_polling'] = False
627
628        if 'grpc_fuzzer' == bazel_rule['generator_function']:
629            # currently we hand-list fuzzers instead of generating them automatically
630            # because there's no way to obtain maxlen property from bazel BUILD file.
631            print('skipping fuzzer ' + test)
632            continue
633
634        # if any tags that restrict platform compatibility are present,
635        # generate the "platforms" field accordingly
636        # TODO(jtattermusch): there is also a "no_linux" tag, but we cannot take
637        # it into account as it is applied by grpc_cc_test when poller expansion
638        # is made (for tests where uses_polling=True). So for now, we just
639        # assume all tests are compatible with linux and ignore the "no_linux" tag
640        # completely.
641        known_platform_tags = set(['no_windows', 'no_mac'])
642        if set(bazel_tags).intersection(known_platform_tags):
643            platforms = []
644            # assume all tests are compatible with linux and posix
645            platforms.append('linux')
646            platforms.append(
647                'posix')  # there is no posix-specific tag in bazel BUILD
648            if not 'no_mac' in bazel_tags:
649                platforms.append('mac')
650            if not 'no_windows' in bazel_tags:
651                platforms.append('windows')
652            test_dict['platforms'] = platforms
653
654        cmdline_args = bazel_rule['args']
655        if cmdline_args:
656            test_dict['args'] = list(cmdline_args)
657
658        if test.startswith('test/cpp'):
659            test_dict['language'] = 'c++'
660
661        elif test.startswith('test/core'):
662            test_dict['language'] = 'c'
663        else:
664            raise Exception('wrong test' + test)
665
666        # short test name without the path.
667        # There can be name collisions, but we will resolve them later
668        simple_test_name = os.path.basename(_extract_source_file_path(test))
669        test_dict['_RENAME'] = simple_test_name
670
671        test_metadata[test] = test_dict
672
673    # detect duplicate test names
674    tests_by_simple_name = {}
675    for test_name, test_dict in test_metadata.items():
676        simple_test_name = test_dict['_RENAME']
677        if not simple_test_name in tests_by_simple_name:
678            tests_by_simple_name[simple_test_name] = []
679        tests_by_simple_name[simple_test_name].append(test_name)
680
681    # choose alternative names for tests with a name collision
682    for collision_list in tests_by_simple_name.values():
683        if len(collision_list) > 1:
684            for test_name in collision_list:
685                long_name = test_name.replace('/', '_').replace(':', '_')
686                print(
687                    'short name of "%s" collides with another test, renaming to %s'
688                    % (test_name, long_name))
689                test_metadata[test_name]['_RENAME'] = long_name
690
691    return test_metadata
692
693
694def _detect_and_print_issues(build_yaml_like: BuildYaml) -> None:
695    """Try detecting some unusual situations and warn about them."""
696    for tgt in build_yaml_like['targets']:
697        if tgt['build'] == 'test':
698            for src in tgt['src']:
699                if src.startswith('src/') and not src.endswith('.proto'):
700                    print('source file from under "src/" tree used in test ' +
701                          tgt['name'] + ': ' + src)
702
703
# extra metadata that will be used to construct build.yaml
# there are mostly extra properties that we weren't able to obtain from the bazel build
# _TYPE: whether this is library, target or test
# _RENAME: whether this target should be renamed to a different name (to match expectations of make and cmake builds)
#
# Other per-target fields mirror properties of the legacy "build.yaml" format:
# - 'language': 'c' or 'c++' (determines how the generators treat the target)
# - 'build': which build configuration the target belongs to; values used
#   here are 'all', 'private', 'protoc' and 'test' ('tool' appears only in
#   the commented-out entries below)
# - 'baselib', 'generate_plugin_registry', 'defaults', 'run': forwarded into
#   the generated YAML; their exact semantics come from the legacy
#   build.yaml templates (see templates/README.md), not from this script.
_BUILD_EXTRA_METADATA = {
    'third_party/address_sorting:address_sorting': {
        'language': 'c',
        'build': 'all',
        '_RENAME': 'address_sorting'
    },
    'gpr': {
        'language': 'c',
        'build': 'all',
    },
    'grpc': {
        'language': 'c',
        'build': 'all',
        'baselib': True,
        'generate_plugin_registry': True
    },
    'grpc++': {
        'language': 'c++',
        'build': 'all',
        'baselib': True,
    },
    'grpc++_alts': {
        'language': 'c++',
        'build': 'all',
        'baselib': True
    },
    'grpc++_error_details': {
        'language': 'c++',
        'build': 'all'
    },
    'grpc++_reflection': {
        'language': 'c++',
        'build': 'all'
    },
    'grpc++_unsecure': {
        'language': 'c++',
        'build': 'all',
        'baselib': True,
    },
    # TODO(jtattermusch): do we need to set grpc_csharp_ext's LDFLAGS for wrapping memcpy in the same way as in build.yaml?
    'grpc_csharp_ext': {
        'language': 'c',
        'build': 'all',
    },
    'grpc_unsecure': {
        'language': 'c',
        'build': 'all',
        'baselib': True,
        'generate_plugin_registry': True
    },
    'grpcpp_channelz': {
        'language': 'c++',
        'build': 'all'
    },
    'grpc++_test': {
        'language': 'c++',
        'build': 'private',
    },
    # protoc plugins: these are binaries ('_TYPE': 'target') built in the
    # 'protoc' configuration rather than libraries.
    'src/compiler:grpc_plugin_support': {
        'language': 'c++',
        'build': 'protoc',
        '_RENAME': 'grpc_plugin_support'
    },
    'src/compiler:grpc_cpp_plugin': {
        'language': 'c++',
        'build': 'protoc',
        '_TYPE': 'target',
        '_RENAME': 'grpc_cpp_plugin'
    },
    'src/compiler:grpc_csharp_plugin': {
        'language': 'c++',
        'build': 'protoc',
        '_TYPE': 'target',
        '_RENAME': 'grpc_csharp_plugin'
    },
    'src/compiler:grpc_node_plugin': {
        'language': 'c++',
        'build': 'protoc',
        '_TYPE': 'target',
        '_RENAME': 'grpc_node_plugin'
    },
    'src/compiler:grpc_objective_c_plugin': {
        'language': 'c++',
        'build': 'protoc',
        '_TYPE': 'target',
        '_RENAME': 'grpc_objective_c_plugin'
    },
    'src/compiler:grpc_php_plugin': {
        'language': 'c++',
        'build': 'protoc',
        '_TYPE': 'target',
        '_RENAME': 'grpc_php_plugin'
    },
    'src/compiler:grpc_python_plugin': {
        'language': 'c++',
        'build': 'protoc',
        '_TYPE': 'target',
        '_RENAME': 'grpc_python_plugin'
    },
    'src/compiler:grpc_ruby_plugin': {
        'language': 'c++',
        'build': 'protoc',
        '_TYPE': 'target',
        '_RENAME': 'grpc_ruby_plugin'
    },

    # TODO(jtattermusch): consider adding grpc++_core_stats

    # test support libraries
    'test/core/util:grpc_test_util': {
        'language': 'c',
        'build': 'private',
        '_RENAME': 'grpc_test_util'
    },
    'test/core/util:grpc_test_util_unsecure': {
        'language': 'c',
        'build': 'private',
        '_RENAME': 'grpc_test_util_unsecure'
    },
    # TODO(jtattermusch): consider adding grpc++_test_util_unsecure - it doesn't seem to be used by bazel build (don't forget to set secure: False)
    'test/cpp/util:test_config': {
        'language': 'c++',
        'build': 'private',
        '_RENAME': 'grpc++_test_config'
    },
    'test/cpp/util:test_util': {
        'language': 'c++',
        'build': 'private',
        '_RENAME': 'grpc++_test_util'
    },

    # end2end test support libraries
    'test/core/end2end:end2end_tests': {
        'language': 'c',
        'build': 'private',
        '_RENAME': 'end2end_tests'
    },
    'test/core/end2end:end2end_nosec_tests': {
        'language': 'c',
        'build': 'private',
        '_RENAME': 'end2end_nosec_tests'
    },

    # benchmark support libraries
    'test/cpp/microbenchmarks:helpers': {
        'language': 'c++',
        'build': 'test',
        'defaults': 'benchmark',
        '_RENAME': 'benchmark_helpers'
    },
    # binaries below are built as part of the test suite but are not run as
    # tests themselves ('run': False) — they are drivers/servers invoked by
    # other test infrastructure.
    'test/cpp/interop:interop_client': {
        'language': 'c++',
        'build': 'test',
        'run': False,
        '_TYPE': 'target',
        '_RENAME': 'interop_client'
    },
    'test/cpp/interop:interop_server': {
        'language': 'c++',
        'build': 'test',
        'run': False,
        '_TYPE': 'target',
        '_RENAME': 'interop_server'
    },
    'test/cpp/interop:xds_interop_client': {
        'language': 'c++',
        'build': 'test',
        'run': False,
        '_TYPE': 'target',
        '_RENAME': 'xds_interop_client'
    },
    'test/cpp/interop:xds_interop_server': {
        'language': 'c++',
        'build': 'test',
        'run': False,
        '_TYPE': 'target',
        '_RENAME': 'xds_interop_server'
    },
    'test/cpp/interop:http2_client': {
        'language': 'c++',
        'build': 'test',
        'run': False,
        '_TYPE': 'target',
        '_RENAME': 'http2_client'
    },
    'test/cpp/qps:qps_json_driver': {
        'language': 'c++',
        'build': 'test',
        'run': False,
        '_TYPE': 'target',
        '_RENAME': 'qps_json_driver'
    },
    'test/cpp/qps:qps_worker': {
        'language': 'c++',
        'build': 'test',
        'run': False,
        '_TYPE': 'target',
        '_RENAME': 'qps_worker'
    },
    'test/cpp/util:grpc_cli': {
        'language': 'c++',
        'build': 'test',
        'run': False,
        '_TYPE': 'target',
        '_RENAME': 'grpc_cli'
    },

    # TODO(jtattermusch): create_jwt and verify_jwt breaks distribtests because it depends on grpc_test_utils and thus requires tests to be built
    # For now it's ok to disable them as these binaries aren't very useful anyway.
    #'test/core/security:create_jwt': { 'language': 'c', 'build': 'tool', '_TYPE': 'target', '_RENAME': 'grpc_create_jwt' },
    #'test/core/security:verify_jwt': { 'language': 'c', 'build': 'tool', '_TYPE': 'target', '_RENAME': 'grpc_verify_jwt' },

    # TODO(jtattermusch): add remaining tools such as grpc_print_google_default_creds_token (they are not used by bazel build)

    # TODO(jtattermusch): these fuzzers had no build.yaml equivalent
    # test/core/compression:message_compress_fuzzer
    # test/core/compression:message_decompress_fuzzer
    # test/core/compression:stream_compression_fuzzer
    # test/core/compression:stream_decompression_fuzzer
    # test/core/slice:b64_decode_fuzzer
    # test/core/slice:b64_encode_fuzzer
}
930
# We need a complete picture of all the targets and dependencies we're interested in
# so we run multiple bazel queries and merge the results.
# Each entry is a bazel query expression; the per-query results are merged
# into a single rule dictionary (a rule appearing in several query results
# is simply overwritten with the same data, since dict.update is used).
_BAZEL_DEPS_QUERIES = [
    'deps("//test/...")',
    'deps("//:all")',
    'deps("//src/compiler/...")',
    'deps("//src/proto/...")',
    # The ^ is needed to differentiate proto_library from go_proto_library
    'deps(kind("^proto_library", @envoy_api//envoy/...))',
]
941
# Step 1: run a bunch of "bazel query --output xml" queries to collect
# the raw build metadata from the bazel build.
# At the end of this step we will have a dictionary of bazel rules
# that are interesting to us (libraries, binaries, etc.) along
# with their most important metadata (sources, headers, dependencies)
#
# Example of a single bazel rule after being populated:
# '//:grpc' : { 'class': 'cc_library',
#               'hdrs': ['//:include/grpc/byte_buffer.h', ... ],
#               'srcs': ['//:src/core/lib/surface/init.cc', ... ],
#               'deps': ['//:grpc_common', ...],
#               ... }
bazel_rules = {}
for query in _BAZEL_DEPS_QUERIES:
    bazel_rules.update(
        _extract_rules_from_bazel_xml(_bazel_query_xml_tree(query)))

# Step 1.5: The sources for UPB protos are pre-generated, so we want
# to expand the UPB proto library bazel rules into the generated
# .upb.h and .upb.c files.
_expand_upb_proto_library_rules(bazel_rules)

# Step 2: Extract the known bazel cc_test tests. While most tests
# will be buildable with other build systems just fine, some of these tests
# would be too difficult to build and run with other build systems,
# so we simply exclude the ones we don't want.
# Note that while making tests buildable with other build systems
# than just bazel is extra effort, we still need to do that for these
# reasons:
# - If our cmake build doesn't have any tests at all, it's hard to make
#   sure that what it built actually works (we need at least some "smoke tests").
#   This is quite important because the build flags between bazel / non-bazel flag might differ
#   (sometimes it's for interesting reasons that are not easy to overcome)
#   which makes it even more important to have at least some tests for cmake/make
# - Our portability suite actually runs cmake tests and migration of portability
#   suite fully towards bazel might be intricate (e.g. it's unclear whether it's
#   possible to get a good enough coverage of different compilers / distros etc.
#   with bazel)
# - some things that are considered "tests" in build.yaml-based builds are actually binaries
#   we'd want to be able to build anyway (qps_json_worker, interop_client, interop_server, grpc_cli)
#   so it's unclear how much make/cmake simplification we would gain by removing just some (but not all) test
# TODO(jtattermusch): Investigate feasibility of running portability suite with bazel.
tests = _exclude_unwanted_cc_tests(_extract_cc_tests(bazel_rules))

# Step 3: Generate the "extra metadata" for all our build targets.
# While the bazel rules give us most of the information we need,
# the legacy "build.yaml" format requires some additional fields that
# we cannot get just from bazel alone (we call that "extra metadata").
# In this step, we basically analyze the build metadata we have from bazel
# and use heuristics to determine (and sometimes guess) the right
# extra metadata to use for each target.
#
# - For some targets (such as the public libraries, helper libraries
#   and executables) determining the right extra metadata is hard to do
#   automatically. For these targets, the extra metadata is supplied "manually"
#   in form of the _BUILD_EXTRA_METADATA dictionary. That allows us to match
#   the semantics of the legacy "build.yaml" as closely as possible.
#
# - For test binaries, it is possible to generate the "extra metadata" mostly
#   automatically using a rule-based heuristic approach because most tests
#   look and behave alike from the build's perspective.
#
# TODO(jtattermusch): Of course neither "_BUILD_EXTRA_METADATA" or
# the heuristic approach used for tests are ideal and they cannot be made
# to cover all possible situations (and are tailored to work with the way
# the grpc build currently works), but the idea was to start with something
# reasonably simple that matches the "build.yaml"-like semantics as closely
# as possible (to avoid changing too many things at once) and gradually get
# rid of the legacy "build.yaml"-specific fields one by one. Once that is done,
# only very little "extra metadata" would be needed and/or it would be trivial
# to generate it automatically.
all_extra_metadata = {}
# Manually curated metadata first, then the auto-generated test metadata
# (tests never collide with the manual entries, so the order is not
# semantically significant).
all_extra_metadata.update(_BUILD_EXTRA_METADATA)
all_extra_metadata.update(
    _generate_build_extra_metadata_for_tests(tests, bazel_rules))

# Step 4: Compute the build metadata that will be used in the final build.yaml.
# The final build metadata includes transitive dependencies, and sources/headers
# expanded without intermediate dependencies.
# Example:
# '//:grpc' : { ...,
#               '_TRANSITIVE_DEPS': ['//:gpr_base', ...],
#               '_COLLAPSED_DEPS': ['gpr', ...],
#               '_COLLAPSED_SRCS': [...],
#               '_COLLAPSED_PUBLIC_HEADERS': [...],
#               '_COLLAPSED_HEADERS': [...]
#             }
_populate_transitive_metadata(bazel_rules, all_extra_metadata.keys())

# Step 4a: Update the existing test metadata with the updated build metadata.
# Certain build metadata of certain test targets depend on the transitive
# metadata that wasn't available earlier.
update_test_metadata_with_transitive_metadata(all_extra_metadata, bazel_rules)

# Step 5: Generate the final metadata for all the targets.
# This is done by combining the bazel build metadata and the "extra metadata"
# we obtained in the previous step.
# In this step, we also perform some interesting massaging of the target metadata
# to end up with a result that is as similar to the legacy build.yaml data
# as possible.
# - Some targets get renamed (to match the legacy build.yaml target names)
# - Some intermediate libraries get elided ("expanded") to better match the set
#   of targets provided by the legacy build.yaml build
#
# Originally the target renaming was introduced to address these concerns:
# - avoid changing too many things at the same time and avoid people getting
#   confused by some well know targets suddenly being missing
# - Makefile/cmake and also language-specific generators rely on some build
#   targets being called exactly the way they are. Some of our testing
#   scripts also invoke executables (e.g. "qps_json_driver") by their name.
# - The autogenerated test name from bazel includes the package path
#   (e.g. "test_cpp_TEST_NAME"). Without renaming, the target names would
#   end up pretty ugly (e.g. test_cpp_qps_qps_json_driver).
# TODO(jtattermusch): reevaluate the need for target renaming in the future.
#
# Example of a single generated target:
# 'grpc' : { 'language': 'c',
#            'public_headers': ['include/grpc/byte_buffer.h', ... ],
#            'headers': ['src/core/ext/filters/client_channel/client_channel.h', ... ],
#            'src': ['src/core/lib/surface/init.cc', ... ],
#            'deps': ['gpr', 'address_sorting', ...],
#            ... }
all_targets_dict = _generate_build_metadata(all_extra_metadata, bazel_rules)

# Step 6: convert the dictionary with all the targets to a dict that has
# the desired "build.yaml"-like layout.
# TODO(jtattermusch): We use the custom "build.yaml"-like layout because
# currently all other build systems use that format as their source of truth.
# In the future, we can get rid of this custom & legacy format entirely,
# but we would need to update the generators for other build systems
# at the same time.
#
# Layout of the result:
# { 'libs': { TARGET_DICT_FOR_LIB_XYZ, ... },
#   'targets': { TARGET_DICT_FOR_BIN_XYZ, ... },
#   'tests': { TARGET_DICT_FOR_TEST_XYZ, ...} }
build_yaml_like = _convert_to_build_yaml_like(all_targets_dict)

# detect and report some suspicious situations we've seen before
_detect_and_print_issues(build_yaml_like)

# Step 7: Store the build_autogenerated.yaml in a deterministic (=sorted)
# and cleaned-up form.
# A basic overview of the resulting "build.yaml"-like format is here:
# https://github.com/grpc/grpc/blob/master/templates/README.md
# TODO(jtattermusch): The "cleanup" function is taken from the legacy
# build system (which used build.yaml) and can be eventually removed.
build_yaml_string = build_cleaner.cleaned_build_yaml_dict_as_string(
    build_yaml_like)
# Write with an explicit encoding so the generated file's bytes do not
# depend on the platform's locale settings.
with open('build_autogenerated.yaml', 'w', encoding='utf-8') as out_file:
    out_file.write(build_yaml_string)