1# This Source Code Form is subject to the terms of the Mozilla Public
2# License, v. 2.0. If a copy of the MPL was not distributed with this
3# file, # You can obtain one at http://mozilla.org/MPL/2.0/.
4
5# Utility package for working with moz.yaml files.
6#
7# Requires `pyyaml` and `voluptuous`
8# (both are in-tree under third_party/python)
9
10from __future__ import absolute_import, print_function, unicode_literals
11
12
13"""
14Problem:
15    ./mach vendor needs to be able to add or remove files from moz.build files automatically to
16    be able to effectively update a library automatically and send useful try runs in.
17
18    So far, it has been difficult to do that.
19    Why:
20        Some files need to go into UNIFIED_SOURCES vs SOURCES
21        Some files are os-specific, and need to go into per-OS conditionals
22        Some files are both UNIFIED_SOURCES/SOURCES sensitive and OS-specific.
23
24Proposal:
25    Design an algorithm that maps a third party library file to a suspected moz.build location.
26    Run the algorithm on all files specified in all third party libraries' moz.build files.
27    See if the proposed place in the moz.build file matches the actual place.
28
29Initial Algorithm
30    Given a file, which includes the filename and the path from gecko root, we want to find the
31        correct moz.build file and location within that file.
32    Take the path of the file, and iterate up the directory tree, looking for moz.build files as
33    we go.
34    Consider each of these moz.build files, starting with the one closest to the file.
35    Within a moz.build file, identify the SOURCES or UNIFIED_SOURCES block(s) that contains a file
36        in the same directory path as the file to be added.
37    If there is only one such block, use that one.
38    If there are multiple blocks, look at the files within each block and note the longest length
39        of a common prefix (including partial filenames - if we just did full directories the
40        result would be the same as the prior step and we would not narrow the results down). Use
41        the block containing the longest prefix. (We call this 'guessing'.)
42
43Result of the proposal:
44    The initial implementation works on 1675 of 1977 elligible files.
45    The files it does not work on include:
46        - general failures. Such as when we find that avutil.cpp wants to be next to adler32.cpp
47          but avutil.cpp is in SOURCES and adler32.cpp is in UNIFIED_SOURCES. (And many similar
48          cases.)
49        - per-cpu-feature files, where only a single file is added under a conditional
50        - When guessing, because of a len(...) > longest_so_far comparison, we would prefer the
51          first block we found.
52          -  Changing this to prefer UNIFIED_SOURCES in the event of a tie
53             yielded 17 additional correct assignments (about a 1% improvement)
54        - As a result of the change immediately above, when guessing, because given equal
55          prefixes, we would prefer a UNIFIED_SOURCES block over other blocks, even if the other
56          blocks are longer
57          - Changing this (again) to prefer the block containing more files yielded 49 additional
58            correct assignments (about a 2.5% improvement)
59
60    The files that are ineligible for consideration are:
61        - Those in libwebrtc
62        - Those specified in source assignments composed of generators (e.g. [f for f in '%.c'])
63        - Those specified in source assignments to subscripted variables
64          (e.g. SOURCES += foo['x86_files'])
65    We needed to iterate up the directory and look at a different moz.build file _zero_ times.
66        This indicates this code is probably not needed, and therefore we will remove it from the
67        algorithm.
68    We needed to guess base on the longest prefix 944 times, indicating that this code is
69        absolutely crucial and should be double-checked. (And indeed, upon double-checking it,
70        bugs were identified.)
71
72    After some initial testing, it was determined that this code completely fell down when the
73    vendoring directory differed from the moz.yaml directory (definitions below.) The code was
74    slightly refactored to handle this case, primarily by (a) re-inserting the logic to check
75    multiple moz.build files instead of the first and (b) handling some complicated normalization
76    notions (details in comments).
77
78Slightly Improved Algorithm Changes:
79    Don't bother iterating up the directory tree looking for moz.build files, just take the first.
80    When guessing, in the event of a common-prefix tie, prefer the block containing more files
81
82    With these changes, we now Successfully Matched 1724 of 1977 files
83
84CODE CONCEPTS
85
86source-assignment
87    An assignment of files to a SOURCES or UNIFIED_SOURCES variable, such as
88    SOURCES += ['ffpvx.cpp']
89
90    We specifically look only for these two variable names to avoid identifying things
91    such as CXX_FLAGS.
92
93    Sometimes; however, there is an intermediary variable, such as SOURCES += celt_filenames
94    In this situation we find the celt_filenames assignment, and treat it as a 'source-assignment'
95
96source-assignment-location
97    source-assignment-location is a human readable string that identifies where in the moz.build
98    file the source-assignment is. It can used to visually match the location upon manual
99    inspection; and given a source-assignment-location, re-identify it when iterating over all
100    source-assignments in a file.
101
102    The actual string consists of the path from the root of the moz.build file to the
103    source-assignment, plus a suffix number.
104
105    We suffix the final value with an incrementing counter. This is to support moz.build files
106    that, for whatever reason, use multiple SOURCES += [] list in the same basic block. This index
107    is per-file, so no two assignments in the same file (even if they have separate locations)
108    should have the same suffix.
109
110    For example:
111
112    When SOURCES += ['ffpvx.xpp'] appears as the first line of the file (or any other
113    unindented-location) its source-assignment-location will be "> SOURCES 1".
114
115    When SOURCES += ['ffpvx.xpp'] appears inside a conditional such as
116    `CONFIG['OS_TARGET'] == 'WINNT'` then its source-assignment-location will be
117    "> if CONFIG['OS_TARGET'] == 'WINNT' > SOURCES 1"
118
119    When SOURCES += ['ffpvx.xpp'] appears as the second line of the file, and a different
120    SOURCES += [] was the first line, then its source-assignment-location will be "> SOURCES 2".
121
122    No two source-assignments may have the same source-assignment-location. If they do, we raise
123    an assert.
124
125file vs filename
126    a 'filename' is a string specifing the name and sometimes the path of a file.
127    a 'file' is an object you get from open()-ing a filename
128
129    A variable that is a string should always use 'filename'
130
131vendoring directory vs moz.yaml directory
132    In many cases, a library's moz.yaml file, moz.build file(s), and sources files will all live
133    under a single directory. e.g. libjpeg
134
135    In other cases, a library's source files are in one directory (we call this the 'vendoring
136    directory') and the moz.yaml file and moz.build file(s) are in another directory (we call this
137    the moz.yaml directory).  e.g. libdav1d
138
139normalized-filename
140    A filename is 'normalized' if it has been expanded to the full path from the gecko root. This
141    requires a moz.build file.
142
143    For example a filename `lib/opus.c` may be specified inside the `media/libopus/moz.build`
144    file. The filename is normalized by os.path.join()-ing the dirname of the moz.build file
145    (i.e. `media/libopus`) to the filename, resulting in `media/libopus/lib/opus.c`
146
147    A filename that begins with '/' is presumed to already be specified relative to the gecko
148    root, and therefore is not modified.
149
150    Normalization gets more complicated when dealing with separate vendoring and moz.yaml
151    directories. This is because a file can be considered normalized when it looks like
152       third_party/libdav1d/src/a.cpp
153    _or_ when it looks like
154       media/libdav1d/../../third_party/libdav1d/src/a.cpp
155    This is because in the moz.build file, it will be specified as
156    `../../third_party/libdav1d/src/a.cpp` and we 'normalize' it by prepending the path to the
157    moz.build file.
158
159    Normalization is not just about having an 'absolute' path from gecko_root to file. In fact
160    it's not really about that at all - it's about matching filenames. Therefore when we are
161    dealing with separate vendoring and moz.yaml directories we will very quickly 're-normalize'
162    a normalized filename to get it into one of those foo/bar/../../third_party/... paths that
163    will make sense for the moz.build file we are interested in.
164
165    Whenever a filename is normalized, it should be specified as such in the variable name,
166    either as a prefix (normalized_filename) or a suffix (target_filename_normalized)
167
168statistic_
169    Using some hacky stuff, we report statistics about how many times we hit certain branches of
170    the code.
171    e.g.
172        "How many times did we refine a guess based on prefix length"
173        "How many times did we refine a guess based on the number of files in the block"
174        "What is the histogram of guess candidates"
175
176    We do this to identify how frequently certain code paths were taken, allowing us to identify
177    strange behavior and investigate outliers. This process lead to identifying bugs and small
178    improvements.
179"""
180
181import os
182import re
183import ast
184import sys
185import copy
186import fileinput
187import subprocess
188
189from pprint import pprint
190from mozbuild.frontend.sandbox import alphabetical_sorted
191
192statistics = {
193    "guess_candidates": {},
194    "number_refinements": {},
195    "needed_to_guess": 0,
196    "length_logic": {},
197}
198
199
200def log(*args, **kwargs):
201    # If is helpful to keep some logging statements around, but we don't want to print them
202    #  unless we are debugging
203    # print(*args, **kwargs)
204    pass
205
206
207# < python 3.8 shims #########################
208# Taskcluster currently runs python 3.5 and it's difficult to move to a more recent one
209# Once Tskcl moves to 3.8 we could try to remove this. (Or keep it for developers who are < 3.8)
210# From https://github.com/python/cpython/blob/44f6b9aa49d562ab7c67952442b8348346b24141/Lib/ast.py
211
212
213def _splitlines_no_ff(source):
214    """Split a string into lines ignoring form feed and other chars.
215    This mimics how the Python parser splits source code.
216    """
217    idx = 0
218    lines = []
219    next_line = ""
220    while idx < len(source):
221        c = source[idx]
222        next_line += c
223        idx += 1
224        # Keep \r\n together
225        if c == "\r" and idx < len(source) and source[idx] == "\n":
226            next_line += "\n"
227            idx += 1
228        if c in "\r\n":
229            lines.append(next_line)
230            next_line = ""
231
232    if next_line:
233        lines.append(next_line)
234    return lines
235
236
237def _pad_whitespace(source):
238    r"""Replace all chars except '\f\t' in a line with spaces."""
239    result = ""
240    for c in source:
241        if c in "\f\t":
242            result += c
243        else:
244            result += " "
245    return result
246
247
248def ast_get_source_segment(source, node):
249    """Get source code segment of the *source* that generated *node*.
250    If some location information (`lineno`, `end_lineno`, `col_offset`,
251    or `end_col_offset`) is missing, return None.
252    If *padded* is `True`, the first line of a multi-line statement will
253    be padded with spaces to match its original position.
254    """
255    try:
256        lineno = node.lineno - 1
257        end_lineno = node.end_lineno - 1
258        col_offset = node.col_offset
259        end_col_offset = node.end_col_offset
260    except AttributeError:
261        return None
262
263    lines = _splitlines_no_ff(source)
264    if end_lineno == lineno:
265        return lines[lineno].encode()[col_offset:end_col_offset].decode()
266
267    if padded:
268        padding = _pad_whitespace(lines[lineno].encode()[:col_offset].decode())
269    else:
270        padding = ""
271
272    first = padding + lines[lineno].encode()[col_offset:].decode()
273    last = lines[end_lineno].encode()[:end_col_offset].decode()
274    lines = lines[lineno + 1 : end_lineno]
275
276    lines.insert(0, first)
277    lines.append(last)
278    return "".join(lines)
279
280
281##############################################
282
283
284def node_to_readable_file_location(code, node, child_node=None):
285    location = ""
286
287    if isinstance(node.parent, ast.Module):
288        # The next node up is the root, don't go higher.
289        pass
290    else:
291        location += node_to_readable_file_location(code, node.parent, node)
292
293    location += " > "
294    if isinstance(node, ast.Module):
295        raise Exception("We shouldn't see a Module")
296    elif isinstance(node, ast.If):
297        assert child_node
298        if child_node in node.body:
299            location += "if " + ast_get_source_segment(code, node.test)
300        else:
301            location += "else-of-if " + ast_get_source_segment(code, node.test)
302    elif isinstance(node, ast.For):
303        location += (
304            "for "
305            + ast_get_source_segment(code, node.target)
306            + " in "
307            + ast_get_source_segment(code, node.iter)
308        )
309    elif isinstance(node, ast.AugAssign):
310        if isinstance(node.target, ast.Name):
311            location += node.target.id
312        else:
313            location += ast_get_source_segment(code, node.target)
314    elif isinstance(node, ast.Assign):
315        # This assert would fire if we did e.g. some_sources = all_sources = [ ... ]
316        assert len(node.targets) == 1, "Assignment node contains more than one target"
317        if isinstance(node.targets[0], ast.Name):
318            location += node.targets[0].id
319        else:
320            location += ast_get_source_segment(code, node.targets[0])
321    else:
322        raise Exception("Got a node type I don't know how to handle: " + str(node))
323
324    return location
325
326
327def assignment_node_to_source_filename_list(code, node):
328    """
329    If the list of filenames is not a list of constants (e.g. it's a generated list)
330    it's (probably) infeasible to try and figure it out. At least we're not going to try
331    right now. Maybe in the future?
332
333    If this happens, we'll return an empty list. The consequence of this is that we
334    won't be able to match a file against this list, so we may not be able to add it.
335
336    (But if the file matches a generated list, perhaps it will be included in the
337    Sources list automatically?)
338    """
339    if isinstance(node.value, ast.List) and "elts" in node.value._fields:
340        for f in node.value.elts:
341            if not isinstance(f, ast.Constant):
342                log(
343                    "Found non-constant source file name in list: ",
344                    ast_get_source_segment(code, f),
345                )
346                return []
347        return [f.value for f in node.value.elts]
348    elif isinstance(node.value, ast.ListComp):
349        # SOURCES += [f for f in foo if blah]
350        log("Could not find the files for " + ast_get_source_segment(code, node.value))
351    elif isinstance(node.value, ast.Name) or isinstance(node.value, ast.Subscript):
352        # SOURCES += other_var
353        # SOURCES += files['X64_SOURCES']
354        log("Could not find the files for " + ast_get_source_segment(code, node))
355    elif isinstance(node.value, ast.Call):
356        # SOURCES += sorted(...)
357        log("Could not find the files for " + ast_get_source_segment(code, node))
358    else:
359        raise Exception(
360            "Unexpected node received in assignment_node_to_source_filename_list: "
361            + str(node)
362        )
363    return []
364
365
366def mozbuild_file_to_source_assignments(normalized_mozbuild_filename):
367    """
368    Returns a dictionary of 'source-assignment-location' -> 'normalized source filename list'
369    contained in the moz.build file specified
370
371    normalized_mozbuild_filename: the moz.build file to read
372    """
373    source_assignments = {}
374
375    # Parse the AST of the moz.build file
376    code = open(normalized_mozbuild_filename).read()
377    root = ast.parse(code)
378
379    # Populate node parents. This allows us to walk up from a node to the root.
380    # (Really I think python's ast class should do this, but it doesn't, so we monkey-patch it)
381    for node in ast.walk(root):
382        for child in ast.iter_child_nodes(node):
383            child.parent = node
384
385    # Find all the assignments of SOURCES or UNIFIED_SOURCES
386    source_assignment_nodes = [
387        node
388        for node in ast.walk(root)
389        if isinstance(node, ast.AugAssign)
390        and isinstance(node.target, ast.Name)
391        and node.target.id in ["SOURCES", "UNIFIED_SOURCES"]
392    ]
393    assert (
394        len([n for n in source_assignment_nodes if not isinstance(n.op, ast.Add)]) == 0
395    ), "We got a Source assignment that wasn't +="
396
397    # Recurse and find nodes where we do SOURCES += other_var or SOURCES += FILES['foo']
398    recursive_assignment_nodes = [
399        node
400        for node in source_assignment_nodes
401        if isinstance(node.value, ast.Name) or isinstance(node.value, ast.Subscript)
402    ]
403
404    recursive_assignment_nodes_names = [
405        node.value.id
406        for node in recursive_assignment_nodes
407        if isinstance(node.value, ast.Name)
408    ]
409
410    # TODO: We do not dig into subscript variables. These are currently only used by two libraries
411    #       that use external sources.mozbuild files.
412    # recursive_assignment_nodes_names.extend([something<node> for node in
413    #    recursive_assignment_nodes if isinstance(node.value, ast.Subscript)]
414
415    additional_assignment_nodes = [
416        node
417        for node in ast.walk(root)
418        if isinstance(node, ast.Assign)
419        and isinstance(node.targets[0], ast.Name)
420        and node.targets[0].id in recursive_assignment_nodes_names
421    ]
422
423    # Remove the original, useless assignment node (the SOURCES += other_var)
424    for node in recursive_assignment_nodes:
425        source_assignment_nodes.remove(node)
426    # Add the other_var += [''] source-assignment
427    source_assignment_nodes.extend(additional_assignment_nodes)
428
429    # Get the source-assignment-location for the node:
430    assignment_index = 1
431    for a in source_assignment_nodes:
432        source_assignment_location = (
433            node_to_readable_file_location(code, a) + " " + str(assignment_index)
434        )
435        source_filename_list = assignment_node_to_source_filename_list(code, a)
436
437        if not source_filename_list:
438            # In some cases (like generated source file lists) we will have an empty list.
439            # If that is the case, just omit the source assignment
440            continue
441
442        normalized_source_filename_list = [
443            normalize_filename(normalized_mozbuild_filename, f)
444            for f in source_filename_list
445        ]
446
447        if source_assignment_location in source_assignments:
448            source_assignment_location = node_to_readable_file_location(code, a)
449
450        assert (
451            source_assignment_location not in source_assignments
452        ), "In %s, two assignments have the same key ('%s')" % (
453            normalized_mozbuild_filename,
454            source_assignment_location,
455        )
456        source_assignments[source_assignment_location] = normalized_source_filename_list
457        assignment_index += 1
458
459    return (source_assignments, root, code)
460
461
462def unnormalize_filename(normalized_mozbuild_filename, normalized_filename):
463    if normalized_filename[0] == "/":
464        return normalized_filename
465
466    mozbuild_path = os.path.dirname(normalized_mozbuild_filename) + "/"
467    return normalized_filename.replace(mozbuild_path, "")
468
469
470def normalize_filename(normalized_mozbuild_filename, filename):
471    if filename[0] == "/":
472        return filename
473
474    mozbuild_path = os.path.dirname(normalized_mozbuild_filename)
475    return os.path.join(mozbuild_path, filename)
476
477
478def get_mozbuild_file_search_order(
479    normalized_filename,
480    moz_yaml_dir=None,
481    vendoring_dir=None,
482    all_mozbuild_filenames_normalized=None,
483):
484    """
485    Returns an ordered list of normalized moz.build filenames to consider for a given filename
486
487    normalized_filename: a source filename normalized to the gecko root
488    moz_yaml_dir: the path from gecko_root to the moz.yaml file (which is the root of the
489                  moz.build files)
490    moz_yaml_dir: the path to where the library's source files are
491    all_mozbuild_filenames_normalized: (optional) the list of all third-party moz.build files
492
493    If all_mozbuild_filenames_normalized is not specified, we look in the filesystem.
494
495    The list is built out of two distinct steps.
496
497    In Step 1 we will walk up a directory tree, looking for moz.build files. We append moz.build
498    files in this order, preferring the lowest moz.build we find, then moving on to one in a
499    higher directory.
500    The directory we start in is a little complicated. We take the series of subdirectories
501    between vendoring_dir and the file in question, and then append them to the moz.yaml
502    directory.
503
504    Example:
505        When moz_yaml directory != vendoring_directory:
506            moz_yaml_dir = foo/bar/
507            vendoring_dir = third_party/baz/
508            normalized_filename = third_party/baz/asm/arm/a.S
509            starting_directory: foo/bar/asm/arm/
510        When moz_yaml directory == vendoring_directory
511            (In this case, these variables will actually be 'None' but the algorthm is the same)
512            moz_yaml_dir = foo/bar/
513            vendoring_dir = foo/bar/
514            normalized_filename = foo/bar/asm/arm/a.S
515            starting_directory: foo/bar/asm/arm/
516
517    In Step 2 we get a bit desparate. When the vendoring directory and the moz_yaml directory are
518    not the same, there is no guarentee that the moz_yaml directory will adhere to the same
519    directory structure as the vendoring directory.  And indeed it doesn't in some cases
520    (e.g. libdav1d.)
521    So in this situation we start at the root of the moz_yaml directory and walk downwards, adding
522    _any_ moz.build file we encounter to the list. Later on (in all cases, not just
523    moz_yaml_dir != vendoring_dir) we only consider a moz.build file if it has source files whose
524    directory matches the normalized_filename, so this step, though desparate, is safe-ish and
525    believe it or not has worked for some file additions.
526    """
527    ordered_list = []
528
529    if all_mozbuild_filenames_normalized is None:
530        assert os.path.isfile(
531            ".arcconfig"
532        ), "We do not seem to be running from the gecko root"
533
534    # The first time around, this variable name is incorrect.
535    #    It's actually the full path+filename, not a directory.
536    test_directory = None
537    if (moz_yaml_dir, vendoring_dir) == (None, None):
538        # In this situation, the library is vendored into the same directory as
539        # the moz.build files. We can start traversing directories up from the file to
540        # add to find the correct moz.build file
541        test_directory = normalized_filename
542    elif moz_yaml_dir and vendoring_dir:
543        # In this situation, the library is vendored in a different place (typically
544        # third_party/foo) from the moz.build files.
545        subdirectory_path = normalized_filename.replace(vendoring_dir, "")
546        test_directory = os.path.join(moz_yaml_dir, subdirectory_path)
547    else:
548        raise Exception("If moz_yaml_dir or vendoring_dir are specified, both must be")
549
550    # Step 1
551    while len(os.path.dirname(test_directory)) > 1:  # While we are not at '/'
552        containing_directory = os.path.dirname(test_directory)
553
554        possible_normalized_mozbuild_filename = os.path.join(
555            containing_directory, "moz.build"
556        )
557
558        if not all_mozbuild_filenames_normalized:
559            if os.path.isfile(possible_normalized_mozbuild_filename):
560                ordered_list.append(possible_normalized_mozbuild_filename)
561        elif possible_normalized_mozbuild_filename in all_mozbuild_filenames_normalized:
562            ordered_list.append(possible_normalized_mozbuild_filename)
563
564        test_directory = containing_directory
565
566    # Step 2
567    if moz_yaml_dir:
568        for root, dirs, files in os.walk(moz_yaml_dir):
569            for f in files:
570                if f == "moz.build":
571                    ordered_list.append(os.path.join(root, f))
572
573    return ordered_list
574
575
576def get_closest_mozbuild_file(
577    normalized_filename,
578    moz_yaml_dir=None,
579    vendoring_dir=None,
580    all_mozbuild_filenames_normalized=None,
581):
582    """
583    Returns the closest moz.build file in the directory tree to a normalized filename
584    """
585    r = get_mozbuild_file_search_order(
586        normalized_filename,
587        moz_yaml_dir,
588        vendoring_dir,
589        all_mozbuild_filenames_normalized,
590    )
591    return r[0] if r else None
592
593
594def filenames_directory_is_in_filename_list(
595    filename_normalized, list_of_normalized_filenames
596):
597    """
598    Given a normalized filename and a list of normalized filenames, first turn them into a
599    containing directory, and a list of containing directories. Then test if the containing
600    directory of the filename is in the list.
601
602    ex:
603        f = filenames_directory_is_in_filename_list
604        f("foo/bar/a.c", ["foo/b.c"]) -> false
605        f("foo/bar/a.c", ["foo/b.c", "foo/bar/c.c"]) -> true
606        f("foo/bar/a.c", ["foo/b.c", "foo/bar/baz/d.c"]) -> false
607    """
608    path_list = set([os.path.dirname(f) for f in list_of_normalized_filenames])
609    return os.path.dirname(filename_normalized) in path_list
610
611
612def find_all_posible_assignments_from_filename(source_assignments, filename_normalized):
613    """
614    Given a list of source assignments and a normalized filename, narrow the list to assignments
615    that contain a file whose directory matches the filename's directory.
616    """
617    possible_assignments = {}
618    for key, list_of_normalized_filenames in source_assignments.items():
619        if not list_of_normalized_filenames:
620            continue
621        if filenames_directory_is_in_filename_list(
622            filename_normalized, list_of_normalized_filenames
623        ):
624            possible_assignments[key] = list_of_normalized_filenames
625    return possible_assignments
626
627
628def guess_best_assignment(source_assignments, filename_normalized):
629    """
630    Given several assignments, all of which contain the same directory as the filename, pick one
631    we think is best and return its source-assignment-location.
632
633    We do this by looking at the filename itself (not just its directory) and picking the
634    assignment which contains a filename with the longest matching prefix.
635
636    e.g: "foo/asm_neon.c" compared to ["foo/main.c", "foo/all_utility.c"], ["foo/asm_arm.c"]
637            -> ["foo/asm_arm.c"] (match of foo/asm_)
638    """
639    length_of_longest_match = 0
640    source_assignment_location_of_longest_match = None
641    statistic_number_refinements = 0
642    statistic_length_logic = 0
643
644    for key, list_of_normalized_filenames in source_assignments.items():
645        for f in list_of_normalized_filenames:
646            if filename_normalized == f:
647                # Do not cheat by matching the prefix of the exact file
648                continue
649
650            prefix = os.path.commonprefix([filename_normalized, f])
651            if len(prefix) > length_of_longest_match:
652                statistic_number_refinements += 1
653                length_of_longest_match = len(prefix)
654                source_assignment_location_of_longest_match = key
655            elif len(prefix) == length_of_longest_match and len(
656                source_assignments[key]
657            ) > len(source_assignments[source_assignment_location_of_longest_match]):
658                statistic_number_refinements += 1
659                statistic_length_logic += 1
660                length_of_longest_match = len(prefix)
661                source_assignment_location_of_longest_match = key
662    return (
663        source_assignment_location_of_longest_match,
664        (statistic_number_refinements, statistic_length_logic),
665    )
666
667
668def edit_moz_build_file_to_add_file(
669    normalized_mozbuild_filename,
670    unnormalized_filename_to_add,
671    unnormalized_list_of_files,
672):
673    """
674    This function edits the moz.build file in-place
675
676    I had _really_ hoped to replace this whole damn thing with something that adds a
677    node to the AST, dumps the AST out, and then runs black on the file but there are
678    some issues:
679      - third party moz.build files (or maybe all moz.build files) aren't always run
680        through black
681      - dumping the ast out losing comments
682    """
683
684    # add the file into the list, and then sort it in the same way the moz.build validator
685    # expects
686    unnormalized_list_of_files.append(unnormalized_filename_to_add)
687    unnormalized_list_of_files = alphabetical_sorted(unnormalized_list_of_files)
688
689    # we're going to add our file by doing a find/replace of an adjacent file in the list
690    indx_of_addition = unnormalized_list_of_files.index(unnormalized_filename_to_add)
691    indx_of_addition
692    if indx_of_addition == 0:
693        target_indx = 1
694        replace_before = False
695    else:
696        target_indx = indx_of_addition - 1
697        replace_before = True
698
699    find_str = unnormalized_list_of_files[target_indx]
700
701    # We will only perform the first replacement. This is because sometimes there's moz.build
702    # code like:
703    #   SOURCES += ['file.cpp']
704    #   SOURCES['file.cpp'].flags += ['-Winline']
705    # If we replaced every time we found the target, we would be inserting into that second
706    # line.
707    did_replace = False
708
709    # FileInput is a strange class that lets you edit a file in-place, but does so by hijacking
710    # stdout, so you just print() the output you want as you go through
711    file = fileinput.FileInput(normalized_mozbuild_filename, inplace=True)
712    for line in file:
713        if not did_replace and find_str in line:
714            did_replace = True
715
716            # Okay, we found the line we need to edit, now we need to be ugly about it
717            # Grab the type of quote used in this moz.build file: single or double
718            quote_type = line[line.index(find_str) - 1]
719
720            if "[" not in line:
721                # We'll want to put our new file onto its own line
722                newline_to_add = "\n"
723                # And copy the indentation of the line we're adding adjacent to
724                indent_value = line[0 : line.index(quote_type)]
725            else:
726                # This is frustrating, we have the start of the array here. We aren't
727                # going to be able to indent things onto a newline properly. We're just
728                # going to have to stick it in on the same line.
729                newline_to_add = ""
730                indent_value = ""
731
732            find_str = "%s%s%s" % (quote_type, find_str, quote_type)
733            if replace_before:
734                replacement_tuple = (
735                    find_str,
736                    newline_to_add,
737                    indent_value,
738                    quote_type,
739                    unnormalized_filename_to_add,
740                    quote_type,
741                )
742                replace_str = "%s,%s%s%s%s%s" % replacement_tuple
743            else:
744                replacement_tuple = (
745                    quote_type,
746                    unnormalized_filename_to_add,
747                    quote_type,
748                    newline_to_add,
749                    indent_value,
750                    find_str,
751                )
752                replace_str = "%s%s%s,%s%s%s" % replacement_tuple
753
754            line = line.replace(find_str, replace_str)
755
756        print(line, end="")  # line has its own newline on it, don't add a second
757    file.close()
758
759
760def edit_moz_build_file_to_remove_file(
761    normalized_mozbuild_filename,
762    unnormalized_filename_to_remove,
763):
764    """
765    This function edits the moz.build file in-place
766    """
767
768    simple_file_line = re.compile(
769        "^\s*['\"]" + unnormalized_filename_to_remove + "['\"],*$"
770    )
771    did_replace = False
772
773    # FileInput is a strange class that lets you edit a file in-place, but does so by hijacking
774    # stdout, so you just print() the output you want as you go through
775    file = fileinput.FileInput(normalized_mozbuild_filename, inplace=True)
776    for line in file:
777        if not did_replace and unnormalized_filename_to_remove in line:
778            did_replace = True
779
780            # If the line consists of just a single source file on it, then we're in the clear
781            # we can just skip this line.
782            if simple_file_line.match(line):
783                # Do not output anything, just keep going.
784                continue
785
786            # Okay, so the line is a little more complicated.
787            quote_type = line[line.index(unnormalized_filename_to_remove) - 1]
788
789            if "[" in line or "]" in line:
790                find_str = "%s%s%s,*" % (
791                    quote_type,
792                    unnormalized_filename_to_remove,
793                    quote_type,
794                )
795                line = re.sub(find_str, "", line)
796            else:
797                raise Exception(
798                    "Got an unusual type of line we're trying to remove a file from:",
799                    line,
800                )
801
802        print(line, end="")
803
804    file.close()
805
806
807def validate_directory_parameters(moz_yaml_dir, vendoring_dir):
808    # Validate the parameters
809    assert (moz_yaml_dir, vendoring_dir) == (None, None) or (
810        moz_yaml_dir and vendoring_dir
811    ), "If either moz_yaml_dir or vendoring_dir are specified, they both must be"
812
813    # Ensure they are provided with trailing slashes
814    moz_yaml_dir += "/" if moz_yaml_dir[-1] != "/" else ""
815    vendoring_dir += "/" if vendoring_dir[-1] != "/" else ""
816
817    return (moz_yaml_dir, vendoring_dir)
818
819
820#########################################################
821# PUBLIC API
822#########################################################
823
824
825def remove_file_from_moz_build_file(
826    normalized_filename_to_remove, moz_yaml_dir=None, vendoring_dir=None
827):
828    """
829    Given a filename, relative to the gecko root (aka normalized), we look for the nearest
830    moz.build file, look in that file for the file, and then edit that moz.build file in-place.
831    """
832    moz_yaml_dir, vendoring_dir = validate_directory_parameters(
833        moz_yaml_dir, vendoring_dir
834    )
835
836    all_possible_normalized_mozbuild_filenames = get_mozbuild_file_search_order(
837        normalized_filename_to_remove, moz_yaml_dir, vendoring_dir, None
838    )
839
840    # normalized_filename_to_remove is the path from gecko_root to the file. However, if we vendor
841    #    separate from moz.yaml; then 'normalization' gets more complicated as explained above.
842    # We will need to re-normalize the filename for each moz.build file we want to test, so we
843    #    save the original normalized filename for this purpose
844    original_normalized_filename_to_remove = normalized_filename_to_remove
845
846    for normalized_mozbuild_filename in all_possible_normalized_mozbuild_filenames:
847        if moz_yaml_dir and vendoring_dir:
848            # Here is where we re-normalize the filename. For the rest of the algorithm, we
849            #    will be using this re-normalized filename.
850            # To re-normalize it, we:
851            #   (a) get the path from gecko_root to the moz.build file we are considering
852            #   (b) compute a relative path from that directory to the file we want
853            #   (c) because (b) started at the moz.build file's directory, it is not
854            #       normalized to the gecko_root. Therefore we need to normalize it by
855            #       prepending (a)
856            a = os.path.dirname(normalized_mozbuild_filename)
857            b = os.path.relpath(normalized_filename_to_remove, start=a)
858            c = os.path.join(a, b)
859            normalized_filename_to_remove = c
860
861        source_assignments, root, code = mozbuild_file_to_source_assignments(
862            normalized_mozbuild_filename
863        )
864
865        for key in source_assignments:
866            normalized_source_filename_list = source_assignments[key]
867            if normalized_filename_to_remove in normalized_source_filename_list:
868                unnormalized_filename_to_remove = unnormalize_filename(
869                    normalized_mozbuild_filename, normalized_filename_to_remove
870                )
871                edit_moz_build_file_to_remove_file(
872                    normalized_mozbuild_filename, unnormalized_filename_to_remove
873                )
874                return
875
876        normalized_filename_to_remove = original_normalized_filename_to_remove
877    raise Exception("Could not remove file")
878
879
880def add_file_to_moz_build_file(
881    normalized_filename_to_add, moz_yaml_dir=None, vendoring_dir=None
882):
883    """
884    This is the overall function. Given a filename, relative to the gecko root (aka normalized),
885    we look for a moz.build file to add it to, look for the place in the moz.build file to add it,
886    and then edit that moz.build file in-place.
887
888    It accepted two optional parameters. If one is specified they both must be. If a library is
889    vendored in a separate place from the moz.yaml file, these parameters specify those two
890    directories.
891    """
892    moz_yaml_dir, vendoring_dir = validate_directory_parameters(
893        moz_yaml_dir, vendoring_dir
894    )
895
896    all_possible_normalized_mozbuild_filenames = get_mozbuild_file_search_order(
897        normalized_filename_to_add, moz_yaml_dir, vendoring_dir, None
898    )
899
900    # normalized_filename_to_add is the path from gecko_root to the file. However, if we vendor
901    #    separate from moz.yaml; then 'normalization' gets more complicated as explained above.
902    # We will need to re-normalize the filename for each moz.build file we want to test, so we
903    #    save the original normalized filename for this purpose
904    original_normalized_filename_to_add = normalized_filename_to_add
905
906    for normalized_mozbuild_filename in all_possible_normalized_mozbuild_filenames:
907        if moz_yaml_dir and vendoring_dir:
908            # Here is where we re-normalize the filename. For the rest of the algorithm, we
909            #    will be using this re-normalized filename.
910            # To re-normalize it, we:
911            #   (a) get the path from gecko_root to the moz.build file we are considering
912            #   (b) compute a relative path from that directory to the file we want
913            #   (c) because (b) started at the moz.build file's directory, it is not
914            #       normalized to the gecko_root. Therefore we need to normalize it by
915            #       prepending (a)
916            a = os.path.dirname(normalized_mozbuild_filename)
917            b = os.path.relpath(normalized_filename_to_add, start=a)
918            c = os.path.join(a, b)
919            normalized_filename_to_add = c
920
921        source_assignments, root, code = mozbuild_file_to_source_assignments(
922            normalized_mozbuild_filename
923        )
924
925        possible_assignments = find_all_posible_assignments_from_filename(
926            source_assignments, normalized_filename_to_add
927        )
928
929        if len(possible_assignments) == 0:
930            normalized_filename_to_add = original_normalized_filename_to_add
931            continue
932
933        assert (
934            len(possible_assignments) > 0
935        ), "Could not find a single possible source assignment"
936        if len(possible_assignments) > 1:
937            best_guess, _ = guess_best_assignment(
938                possible_assignments, normalized_filename_to_add
939            )
940            chosen_source_assignment_location = best_guess
941        else:
942            chosen_source_assignment_location = list(possible_assignments.keys())[0]
943
944        guessed_list_containing_normalized_filenames = possible_assignments[
945            chosen_source_assignment_location
946        ]
947
948        # unnormalize filenames so we can edit the moz.build file. They rarely use full paths.
949        unnormalized_filename_to_add = unnormalize_filename(
950            normalized_mozbuild_filename, normalized_filename_to_add
951        )
952        unnormalized_list_of_files = [
953            unnormalize_filename(normalized_mozbuild_filename, f)
954            for f in guessed_list_containing_normalized_filenames
955        ]
956
957        edit_moz_build_file_to_add_file(
958            normalized_mozbuild_filename,
959            unnormalized_filename_to_add,
960            unnormalized_list_of_files,
961        )
962        return
963    assert False, "Could not find a single moz.build file to edit"
964
965
966#########################################################
967# TESTING CODE
968#########################################################
969
970
971def get_all_target_filenames_normalized(all_mozbuild_filenames_normalized):
972    """
973    Given a list of moz.build files, returns all the files listed in all the souce assignments
974    in the file.
975
976    This function is only used for debug/testing purposes - there is no reason to call this
977    as part of 'the algorithm'
978    """
979    all_target_filenames_normalized = []
980    for normalized_mozbuild_filename in all_mozbuild_filenames_normalized:
981        source_assignments, root, code = mozbuild_file_to_source_assignments(
982            normalized_mozbuild_filename
983        )
984        for key in source_assignments:
985            list_of_normalized_filenames = source_assignments[key]
986            all_target_filenames_normalized.extend(list_of_normalized_filenames)
987
988    return all_target_filenames_normalized
989
990
991def try_to_match_target_file(
992    all_mozbuild_filenames_normalized, target_filename_normalized
993):
994    """
995    Runs 'the algorithm' on a target file, and returns if the algorithm was successful
996
997    all_mozbuild_filenames_normalized: the list of all third-party moz.build files
998    target_filename_normalized - the target filename, normalized to the gecko root
999    """
1000
1001    # We do not update the statistics for failed matches, so save a copy
1002    global statistics
1003    backup_statistics = copy.deepcopy(statistics)
1004
1005    if "" == target_filename_normalized:
1006        raise Exception("Received an empty target_filename_normalized")
1007
1008    normalized_mozbuild_filename = get_closest_mozbuild_file(
1009        target_filename_normalized, None, None, all_mozbuild_filenames_normalized
1010    )
1011    if not normalized_mozbuild_filename:
1012        return (False, "No moz.build file found")
1013
1014    source_assignments, root, code = mozbuild_file_to_source_assignments(
1015        normalized_mozbuild_filename
1016    )
1017    possible_assignments = find_all_posible_assignments_from_filename(
1018        source_assignments, target_filename_normalized
1019    )
1020
1021    if len(possible_assignments) == 0:
1022        raise Exception("No possible assignments were found")
1023    elif len(possible_assignments) > 1:
1024        (
1025            best_guess,
1026            (statistic_number_refinements, statistic_length_logic),
1027        ) = guess_best_assignment(possible_assignments, target_filename_normalized)
1028        chosen_source_assignment_location = best_guess
1029
1030        statistics["needed_to_guess"] += 1
1031
1032        if len(possible_assignments) not in statistics["guess_candidates"]:
1033            statistics["guess_candidates"][len(possible_assignments)] = 0
1034        statistics["guess_candidates"][len(possible_assignments)] += 1
1035
1036        if statistic_number_refinements not in statistics["number_refinements"]:
1037            statistics["number_refinements"][statistic_number_refinements] = 0
1038        statistics["number_refinements"][statistic_number_refinements] += 1
1039
1040        if statistic_length_logic not in statistics["length_logic"]:
1041            statistics["length_logic"][statistic_length_logic] = 0
1042        statistics["length_logic"][statistic_length_logic] += 1
1043
1044    else:
1045        chosen_source_assignment_location = list(possible_assignments.keys())[0]
1046
1047    guessed_list_containing_normalized_filenames = possible_assignments[
1048        chosen_source_assignment_location
1049    ]
1050
1051    if target_filename_normalized in guessed_list_containing_normalized_filenames:
1052        return (True, None)
1053
1054    # Restore the copy of the statistics so we don't alter it for failed matches
1055    statistics = backup_statistics
1056    return (False, chosen_source_assignment_location)
1057
1058
1059def get_gecko_root():
1060    """
1061    Using __file__ as a base, find the gecko root
1062    """
1063    gecko_root = None
1064    directory_to_check = os.path.dirname(os.path.abspath(__file__))
1065    while not os.path.isfile(os.path.join(directory_to_check, ".arcconfig")):
1066        directory_to_check = os.path.dirname(directory_to_check)
1067        if directory_to_check == "/":
1068            print("Could not find gecko root")
1069            sys.exit(1)
1070
1071    gecko_root = directory_to_check
1072    return gecko_root
1073
1074
1075def get_all_mozbuild_filenames(gecko_root):
1076    """
1077    Find all the third party moz.build files in the gecko repo
1078    """
1079    third_party_paths = open(
1080        os.path.join(gecko_root, "tools", "rewriting", "ThirdPartyPaths.txt")
1081    ).readlines()
1082    all_mozbuild_filenames_normalized = []
1083    for path in third_party_paths:
1084        # We need shell=True because some paths are specified as globs
1085        # We need an exception handler because sometimes the directory doesn't exist and find barfs
1086        try:
1087            output = subprocess.check_output(
1088                "find %s -name moz.build" % os.path.join(gecko_root, path.strip()),
1089                shell=True,
1090            ).decode("utf-8")
1091            for f in output.split("\n"):
1092                f = f.replace("//", "/").strip().replace(gecko_root, "")[1:]
1093                if f:
1094                    all_mozbuild_filenames_normalized.append(f)
1095        except Exception:
1096            pass
1097
1098    return all_mozbuild_filenames_normalized
1099
1100
1101def test_all_third_party_files(gecko_root, all_mozbuild_filenames_normalized):
1102    """
1103    Run the algorithm on every source file in a third party moz.build file and output the results
1104    """
1105    all_mozbuild_filenames_normalized = [
1106        f for f in all_mozbuild_filenames_normalized if "webrtc" not in f
1107    ]
1108    all_target_filenames_normalized = get_all_target_filenames_normalized(
1109        all_mozbuild_filenames_normalized
1110    )
1111
1112    total_attempted = 0
1113    failed_matched = []
1114    successfully_matched = 0
1115
1116    print("Going to try to match %i files..." % len(all_target_filenames_normalized))
1117    for target_filename_normalized in all_target_filenames_normalized:
1118        result, wrong_guess = try_to_match_target_file(
1119            all_mozbuild_filenames_normalized, target_filename_normalized
1120        )
1121
1122        total_attempted += 1
1123        if result:
1124            successfully_matched += 1
1125        else:
1126            failed_matched.append((target_filename_normalized, wrong_guess))
1127        if total_attempted % 100 == 0:
1128            print("Progress:", total_attempted)
1129
1130    print(
1131        "Successfully Matched %i of %i files" % (successfully_matched, total_attempted)
1132    )
1133    if failed_matched:
1134        print("Failed files:")
1135        for f in failed_matched:
1136            print("\t", f[0], f[1])
1137    print("Statistics:")
1138    pprint(statistics)
1139
1140
1141if __name__ == "__main__":
1142    gecko_root = get_gecko_root()
1143
1144    os.chdir(gecko_root)
1145    add_file_to_moz_build_file(
1146        "third_party/dav1d/src/arm/32/ipred16.S", "media/libdav1d", "third_party/dav1d/"
1147    )
1148
1149    # all_mozbuild_filenames_normalized = get_all_mozbuild_filenames(gecko_root)
1150    # test_all_third_party_files(gecko_root, all_mozbuild_filenames_normalized)
1151