# coding=utf-8
"""Look up SPDX licenses by name or id, and guess the license of a text.

Matching compares the set of words in the supplied text with the set of
words in each SPDX license template and scores the overlap.
"""

from collections import namedtuple
import os
import re

import spdx
from spdx import License

# Result of a match: similarity score (0-100), the License object, and the
# file name the text was read from (None when matching raw content).
LicenseMatch = namedtuple('LicenseMatch',
                          ['confidence', 'license', 'filename'])

# Lower-cased, stripped license id / name -> index into spdx._licenses.
_id_idx = {}
_name_idx = {}

for n, record in enumerate(spdx._licenses):
    _name_idx[record['name'].lower().strip()] = n
    _id_idx[record['id'].lower().strip()] = n


# NOTE(review): the first alternative always wins, so an apostrophised word
# such as "don't" is extracted as "don" and "t"; the second alternative is
# never reached.  Left unchanged because altering it would shift scores.
_word_set_re = re.compile(r"([A-Za-z]+|[A-Za-z]+'[A-Za-z]+)")
# Every fragment is a raw string so that \s and \d reach the regex engine
# without relying on Python passing unknown string escapes through.
_copyright_re = re.compile(r'\s*Copyright\s*(©|\(c\)|\xC2\xA9)?' +
                           r'\s*(\d{4}|.year.)(.*)?' +
                           r'(?:\s*All rights reserved).*', re.I | re.M)
_spdx_var_re = re.compile(r'<<var;name=(.*?);' +
                          r'original=(.*?);match=(.*?)>>', re.I | re.M)


def _spdx_var_orig(match):
    """Return the ``original=`` field of a ``<<var;...>>`` regex match."""
    return match.group(2)


def _spdx_var_match(match):
    """Return the ``match=`` field of a ``<<var;...>>`` regex match."""
    return match.group(3)


def _get_word_set(content, spdx=False):
    """Return the set of lower-cased words found in *content*.

    Text matched by ``_copyright_re`` is removed first.  When *spdx* is
    true, each ``<<var;...>>`` placeholder is replaced by its ``original``
    value before the words are extracted.
    """
    text = _copyright_re.sub('', content.lower())
    if spdx:
        text = _spdx_var_re.sub(_spdx_var_orig, text)
    return set(_word_set_re.findall(text))


def by_name(name):
    """Return the License whose name equals *name*, or None.

    The comparison ignores case and surrounding whitespace.
    """
    idx = _name_idx.get(name.strip().lower())
    if idx is None:
        return None
    return License(spdx._licenses[idx])


def by_id(id_):
    """Return the License whose id equals *id_*, or None.

    The comparison ignores case and surrounding whitespace.
    """
    idx = _id_idx.get(id_.strip().lower())
    if idx is None:
        return None
    return License(spdx._licenses[idx])


# Special cases for matching: these license ids are skipped by _match_all
# unless include_hidden is true.
_hidden = {
    'BSD-2-Clause-FreeBSD',
    'BSD-2-Clause-NetBSD',
    'Mup'
}


def _match_all(content, threshold=90, include_hidden=False):
    """Score *content* against every license template.

    Returns a list of ``(similarity, License)`` tuples whose similarity
    (0-100) is at least *threshold*, sorted by ascending similarity, so the
    best match is the LAST element.  Licenses listed in ``_hidden`` are
    skipped unless *include_hidden* is true.
    """
    word_set = _get_word_set(content)
    if not word_set:
        # Nothing to compare.  This also avoids dividing by zero below when
        # a template yields no words either.
        return []

    # NOTE(review): the permitted size difference grows with `threshold`,
    # so low thresholds prune candidates more aggressively than high ones.
    # Confirm this is intended.
    max_delta = len(word_set) * threshold/100.0

    potentials = []
    for entry in spdx._licenses:
        lic = License(entry)
        if lic.id in _hidden and not include_hidden:
            continue

        license_ws = _get_word_set(lic.template, True)
        delta = abs(len(word_set) - len(license_ws))

        if delta <= max_delta:
            potentials.append((delta, lic, license_ws))

    # Sort-then-reverse is kept deliberately rather than reverse=True: the
    # two order equal-delta entries differently, and that order decides
    # ties in the stable sort of the final result.
    potentials.sort(key=lambda x: x[0])
    potentials.reverse()

    matches = []
    for _, lic, license_ws in potentials:
        overlap = len(word_set & license_ws)
        total = len(word_set) + len(license_ws)
        # Twice the shared words over the combined size, as a percentage.
        similarity = 100.0 * (overlap * 2.0 / total)

        if similarity >= threshold:
            matches.append((similarity, lic))

    return sorted(matches, key=lambda x: x[0])


def match(content, threshold=90, include_hidden=False):
    """Return the best LicenseMatch for *content*, or None.

    ``filename`` on the result is always None.
    """
    matches = _match_all(content, threshold, include_hidden)
    if not matches:
        return None

    # _match_all sorts ascending, so the best match is last.
    best = matches[-1]
    return LicenseMatch(best[0], best[1], None)


# File-name patterns for likely license files.  Earlier entries receive a
# higher score in _file_score.
_license_fn_res = [
    re.compile(r'^(un)?licen[sc]e$', re.I),
    re.compile(r'^(un)?licen[sc]e\.(md|markdown|txt)$', re.I),
    re.compile(r'^copy(ing|right)(\.[^.]+)?$', re.I),
    re.compile(r'^(un)?licen[sc]e\.[^.]+$', re.I),
    re.compile(r'licen[sc]e', re.I)
]


def _file_score(path):
    """Score how much the base name of *path* looks like a license file.

    Returns ``len(_license_fn_res) - n`` for the first pattern (index *n*)
    that matches, or 0 when none match.
    """
    fn = os.path.basename(path)

    # NOTE(review): regex.match anchors at the start of the name, so the
    # last (unanchored) pattern only hits names that begin with
    # "licen[sc]e".  Confirm whether a search anywhere was intended.
    for n, regex in enumerate(_license_fn_res):
        if regex.match(fn):
            return len(_license_fn_res) - n
    return 0


def match_path(path, threshold=90, include_hidden=False):
    """Guess the license of the directory *path* from its license file.

    The regular file in *path* with the highest ``_file_score`` is read and
    matched.  Returns the best LicenseMatch (with ``filename`` set to that
    file's name), or None when no candidate file exists or nothing reaches
    *threshold*.

    Raises ValueError if *path* is not a directory.
    """
    if not os.path.isdir(path):
        raise ValueError("Path must be a directory")

    candidates = []
    for fn in os.listdir(path):
        fnp = os.path.join(path, fn)
        score = _file_score(fnp)
        # Skip sub-directories (e.g. "licenses/") that merely look like a
        # license file; opening one would raise.
        if score > 0 and os.path.isfile(fnp):
            candidates.append((score, fn))

    if not candidates:
        return None

    candidates.sort(key=lambda x: x[0])

    fn = candidates.pop()[1]
    with open(os.path.join(path, fn)) as f:
        matches = _match_all(f.read(), threshold, include_hidden)

    if not matches:
        return None

    # _match_all sorts ascending, so the best match is the last element
    # (consistent with match()).
    best = matches[-1]
    return LicenseMatch(best[0], best[1], fn)