1# coding=utf-8
2
3from collections import namedtuple
4import os
5import re
6
7import spdx
8from spdx import License
9
# Result record returned by ``match`` and ``match_path``:
#   confidence -- similarity score (a percentage) of the winning license
#   license    -- the matched ``License`` object
#   filename   -- name of the file that was inspected, or ``None`` when the
#                 text was passed in directly
LicenseMatch = namedtuple(
    'LicenseMatch',
    'confidence license filename',
)
12
# Lookup tables from a normalized (lower-cased, stripped) license id / name
# to the position of its record in ``spdx._licenses``.
_id_idx = {}
_name_idx = {}

for n, record in enumerate(spdx._licenses):
    # Fill the name index first, then the id index, for every record.
    for _index, _field in ((_name_idx, 'name'), (_id_idx, 'id')):
        _index[record[_field].lower().strip()] = n
19
20
21_word_set_re = re.compile(r"([A-Za-z]+|[A-Za-z]+'[A-Za-z]+)")
22_copyright_re = re.compile(r'\s*Copyright\s*(©|\(c\)|\xC2\xA9)?' +
23                            '\s*(\d{4}|.year.)(.*)?' +
24                            '(?:\s*All rights reserved).*', re.I | re.M)
25_spdx_var_re = re.compile(r'<<var;name=(.*?);' +
26                           'original=(.*?);match=(.*?)>>', re.I | re.M)
27
28
29def _spdx_var_orig(match):
30    return match.group(2)
31
32
33def _spdx_var_match(match):
34    return match.group(3)
35
36
def _get_word_set(content, spdx=False):
    """Reduce a text to the set of distinct words it contains.

    The text is lower-cased and copyright notices are removed.  When
    ``spdx`` is true the text is treated as an SPDX template: every
    ``<<var;...>>`` placeholder is replaced by its ``original`` value
    before the words are collected.
    """
    text = _copyright_re.sub('', content.lower())
    if not spdx:
        return set(_word_set_re.findall(text))

    # Substitute each template variable with its "original" text (group 2).
    expanded = _spdx_var_re.sub(lambda var: var.group(2), text)
    return set(_word_set_re.findall(expanded))
42
43
def by_name(name):
    """Look up a license by name, ignoring case and surrounding whitespace.

    Returns a ``License`` built from the indexed record, or ``None`` when
    no record is indexed under that name.
    """
    position = _name_idx.get(name.strip().lower())
    if position is None:
        return None
    return License(spdx._licenses[position])
50
51
def by_id(id_):
    """Look up a license by id, ignoring case and surrounding whitespace.

    Returns a ``License`` built from the indexed record, or ``None`` when
    no record is indexed under that id.
    """
    position = _id_idx.get(id_.strip().lower())
    if position is None:
        return None
    return License(spdx._licenses[position])
58
59
60# Special cases for matching
61_hidden = {
62    'BSD-2-Clause-FreeBSD',
63    'BSD-2-Clause-NetBSD',
64    'Mup'
65}
66
67
def _match_all(content, threshold=90, include_hidden=False):
    """Score ``content`` against every license template.

    Similarity is the Sorensen-Dice coefficient of the two word sets,
    ``2 * |A & B| / (|A| + |B|)``, expressed as a percentage.

    content        -- text to identify.
    threshold      -- minimum similarity (percentage) a license must reach.
    include_hidden -- when false, licenses whose id is listed in
                      ``_hidden`` are skipped.

    Returns a list of ``(similarity, License)`` tuples sorted by ascending
    similarity, so the best match is the last element.  The list is empty
    when nothing reaches ``threshold``.
    """
    word_set = _get_word_set(content)
    n_content = len(word_set)

    potentials = []
    for record in spdx._licenses:
        lic = License(record)
        if lic.id in _hidden and not include_hidden:
            continue

        license_ws = _get_word_set(lic.template, True)
        n_license = len(license_ws)
        total = n_content + n_license
        if total == 0:
            # Neither text contains a word: the similarity is undefined
            # (0 / 0), so there is nothing meaningful to report.
            continue

        # Cheap pre-filter: the overlap can never exceed the smaller set,
        # so this is the highest similarity the pair could possibly reach.
        # Unlike a size-difference cut-off proportional to ``threshold``,
        # this bound never discards a pair that would pass the real test,
        # whatever the threshold is.
        best_possible = 100.0 * (min(n_content, n_license) * 2.0 / total)
        if best_possible >= threshold:
            delta = abs(n_content - n_license)
            potentials.append((delta, lic, license_ws))

    # Largest size difference first; together with the stable sort below
    # this makes the closest-sized license win among equal similarities.
    potentials.sort(key=lambda p: p[0])
    potentials.reverse()

    matches = []
    for _, lic, license_ws in potentials:
        overlap = len(word_set & license_ws)
        total = n_content + len(license_ws)
        similarity = 100.0 * (overlap * 2.0 / total)

        if similarity >= threshold:
            matches.append((similarity, lic))

    return sorted(matches, key=lambda m: m[0])
97
98
def match(content, threshold=90, include_hidden=False):
    """Identify the license whose template is most similar to ``content``.

    Returns a ``LicenseMatch`` whose ``filename`` is ``None``, or ``None``
    when no license reaches ``threshold``.
    """
    ranked = _match_all(content, threshold, include_hidden)
    if not ranked:
        return None

    # ``_match_all`` sorts by ascending similarity: the best one is last.
    confidence, best_license = ranked[-1]
    return LicenseMatch(confidence, best_license, None)
106
107_license_fn_res = [
108    re.compile('^(un)?licen[sc]e$', re.I),
109    re.compile('^(un)?licen[sc]e\.(md|markdown|txt)$', re.I),
110    re.compile('^copy(ing|right)(\.[^.]+)?$', re.I),
111    re.compile('^(un)?licen[sc]e\.[^.]+$', re.I),
112    re.compile('licen[sc]e', re.I)
113]
114
def _file_score(path):
    """Rate how strongly the file name of ``path`` looks like a license file.

    The score is ``len(_license_fn_res) - i`` for the first pattern (index
    ``i``) that matches the base name, so earlier patterns score higher.
    Returns 0 when no pattern matches.
    """
    basename = os.path.basename(path)
    pattern_count = len(_license_fn_res)
    scores = (pattern_count - position
              for position, pattern in enumerate(_license_fn_res)
              if pattern.match(basename))
    return next(scores, 0)
122
def match_path(path, threshold=90, include_hidden=False):
    """Find the most likely license file in a directory and identify it.

    path           -- directory to inspect (not searched recursively).
    threshold      -- minimum similarity (percentage) a license must reach.
    include_hidden -- forwarded to ``_match_all``.

    Returns a ``LicenseMatch`` whose ``filename`` is the name of the
    inspected file, or ``None`` when the directory holds no candidate file
    or the candidate's text does not reach ``threshold``.

    Raises ``ValueError`` when ``path`` is not a directory.
    """
    if not os.path.isdir(path):
        raise ValueError("Path must be a directory")

    candidates = []
    for fn in os.listdir(path):
        fnp = os.path.join(path, fn)
        # Only regular files can be read; a sub-directory that happens to
        # be called e.g. "license" must not be handed to ``open``.
        if not os.path.isfile(fnp):
            continue
        score = _file_score(fnp)
        if score > 0:
            candidates.append((score, fn))

    if not candidates:
        return None

    # Highest-scoring file name wins.
    candidates.sort(key=lambda c: c[0])
    fn = candidates[-1][1]

    # Read the file and close it before the matching work starts.
    with open(os.path.join(path, fn)) as f:
        content = f.read()

    matches = _match_all(content, threshold, include_hidden)
    if not matches:
        return None

    # ``_match_all`` sorts by ascending similarity, so the best match is
    # the LAST element (the same choice ``match`` makes), not the first.
    confidence, best_license = matches[-1]
    return LicenseMatch(confidence, best_license, fn)
145
146