1package enry
2
3import (
4	"bytes"
5	"path/filepath"
6	"regexp"
7	"sort"
8	"strings"
9
10	"github.com/go-enry/go-enry/v2/data"
11	"github.com/go-enry/go-enry/v2/regex"
12)
13
// binSniffLen is the maximum number of leading bytes that IsBinary inspects
// when looking for a NUL byte.
const binSniffLen = 8000
15
// configurationLanguages is the set of language names that IsConfiguration
// treats as configuration languages. It is used purely as a membership set,
// hence the empty struct values.
var configurationLanguages = map[string]struct{}{
	"XML":  {},
	"JSON": {},
	"TOML": {},
	"YAML": {},
	"INI":  {},
	"SQL":  {},
}
24
25// IsConfiguration tells if filename is in one of the configuration languages.
26func IsConfiguration(path string) bool {
27	language, _ := GetLanguageByExtension(path)
28	_, is := configurationLanguages[language]
29	return is
30}
31
// IsImage reports whether path has a PNG, JPEG or GIF file extension
// (".png", ".jpg", ".jpeg" or ".gif"). Only the extension is examined, and
// the comparison is case-sensitive.
func IsImage(path string) bool {
	switch filepath.Ext(path) {
	case ".png", ".jpg", ".jpeg", ".gif":
		return true
	default:
		return false
	}
}
41
42// GetMIMEType returns a MIME type of a given file based on its languages.
43func GetMIMEType(path string, language string) string {
44	if mime, ok := data.LanguagesMime[language]; ok {
45		return mime
46	}
47
48	if IsImage(path) {
49		return "image/" + filepath.Ext(path)[1:]
50	}
51
52	return "text/plain"
53}
54
// IsDocumentation returns whether or not path is a documentation path, i.e.
// whether it matches at least one of the regexps in
// data.DocumentationMatchers.
func IsDocumentation(path string) bool {
	return matchRegexSlice(data.DocumentationMatchers, path)
}
59
// IsDotFile reports whether the last element of the cleaned path starts with
// a dot. A path that cleans to "." itself (for example "" or "./") is not
// considered a dot file.
func IsDotFile(path string) bool {
	name := filepath.Base(filepath.Clean(path))
	if name == "." {
		return false
	}
	return strings.HasPrefix(name, ".")
}
65
// isVendorRegExp is the single collated regexp that IsVendor matches paths
// against. It is compiled once, in this file's init function, from the
// patterns in data.VendorMatchers.
var isVendorRegExp *regexp.Regexp

// IsVendor returns whether or not path is a vendor path, i.e. whether path
// matches isVendorRegExp.
func IsVendor(path string) bool {
	return isVendorRegExp.MatchString(path)
}
72
// IsTest returns whether or not path is a test path, i.e. whether it matches
// at least one of the regexps in data.TestMatchers.
func IsTest(path string) bool {
	return matchRegexSlice(data.TestMatchers, path)
}
77
78// IsBinary detects if data is a binary value based on:
79// http://git.kernel.org/cgit/git/git.git/tree/xdiff-interface.c?id=HEAD#n198
80func IsBinary(data []byte) bool {
81	if len(data) > binSniffLen {
82		data = data[:binSniffLen]
83	}
84
85	if bytes.IndexByte(data, byte(0)) == -1 {
86		return false
87	}
88
89	return true
90}
91
92// GetColor returns a HTML color code of a given language.
93func GetColor(language string) string {
94	if color, ok := data.LanguagesColor[language]; ok {
95		return color
96	}
97
98	if color, ok := data.LanguagesColor[GetLanguageGroup(language)]; ok {
99		return color
100	}
101
102	return "#cccccc"
103}
104
105func matchRegexSlice(exprs []regex.EnryRegexp, str string) bool {
106	for _, expr := range exprs {
107		if expr.MatchString(str) {
108			return true
109		}
110	}
111
112	return false
113}
114
115// IsGenerated returns whether the file with the given path and content is a
116// generated file.
117func IsGenerated(path string, content []byte) bool {
118	ext := strings.ToLower(filepath.Ext(path))
119	if _, ok := data.GeneratedCodeExtensions[ext]; ok {
120		return true
121	}
122
123	for _, m := range data.GeneratedCodeNameMatchers {
124		if m(path) {
125			return true
126		}
127	}
128
129	path = strings.ToLower(path)
130	for _, m := range data.GeneratedCodeMatchers {
131		if m(path, ext, content) {
132			return true
133		}
134	}
135
136	return false
137}
138
139func init() {
140	// We now collate the individual regexps that make up the VendorMatchers to
141	// produce a single large regexp which is around twice as fast to test than
142	// simply iterating through all the regexps or naïvely collating the
143	// regexps.
144	//
145	// ---
146	//
147	// data.VendorMatchers here is a slice containing individual regexps that
148	// match a vendor file therefore if we want to test if a filename is a
149	// Vendor we need to test whether that filename matches one or more of
150	// those regexps.
151	//
152	// Now we could test each matcher in turn using a shortcircuiting test i.e.
153	//
154	//  	func IsVendor(filename string) bool {
155	// 			for _, matcher := range data.VendorMatchers {
156	// 				if matcher.Match(filename) {
157	//					return true
158	//				}
159	//			}
160	//			return false
161	//		}
162	//
163	// Or concatentate all these regexps using groups i.e.
164	//
165	//		`(regexp1)|(regexp2)|(regexp3)|...`
166	//
167	// However both of these are relatively slow and they don't take advantage
168	// of the inherent structure within our regexps...
169	//
170	// If we look at our regexps there are essentially three types of regexp:
171	//
172	// 1. Those that start with `^`
173	// 2. Those that start with `(^|/)`
174	// 3. Others
175	//
176	// If we collate our regexps into these groups that will significantly
177	// reduce the likelihood of backtracking within the regexp trie matcher.
178	//
179	// A further improvement is to use non-capturing groups as otherwise the
180	// regexp parser, whilst matching, will have to allocate slices for
181	// matching positions. (A future improvement here could be in the use of
182	// enforcing non-capturing groups within the sub-regexps too.)
183	//
184	// Finally if we sort the segments we can help the matcher build a more
185	// efficient matcher and trie.
186
187	// alias the VendorMatchers to simplify things
188	matchers := data.VendorMatchers
189
190	// Create three temporary string slices for our three groups above - prefixes removed
191	caretStrings := make([]string, 0, 10)
192	caretSegmentStrings := make([]string, 0, 10)
193	matcherStrings := make([]string, 0, len(matchers))
194
195	// Walk the matchers and check their string representation for each group prefix, remove it and add to the respective group slices
196	for _, matcher := range matchers {
197		str := matcher.String()
198		if str[0] == '^' {
199			caretStrings = append(caretStrings, str[1:])
200		} else if str[0:5] == "(^|/)" {
201			caretSegmentStrings = append(caretSegmentStrings, str[5:])
202		} else {
203			matcherStrings = append(matcherStrings, str)
204		}
205	}
206
207	// Sort the strings within each group - a potential further improvement could be in simplifying within these groups
208	sort.Strings(caretSegmentStrings)
209	sort.Strings(caretStrings)
210	sort.Strings(matcherStrings)
211
212	// Now build the collated regexp
213	sb := &strings.Builder{}
214
215	// Start with group 1 - those that started with `^`
216	sb.WriteString("(?:^(?:")
217	sb.WriteString(caretStrings[0])
218	for _, matcher := range caretStrings[1:] {
219		sb.WriteString(")|(?:")
220		sb.WriteString(matcher)
221	}
222	sb.WriteString("))")
223	sb.WriteString("|")
224
225	// Now add group 2 - those that started with `(^|/)`
226	sb.WriteString("(?:(?:^|/)(?:")
227	sb.WriteString(caretSegmentStrings[0])
228	for _, matcher := range caretSegmentStrings[1:] {
229		sb.WriteString(")|(?:")
230		sb.WriteString(matcher)
231	}
232	sb.WriteString("))")
233	sb.WriteString("|")
234
235	// Finally add the rest
236	sb.WriteString("(?:")
237	sb.WriteString(matcherStrings[0])
238	for _, matcher := range matcherStrings[1:] {
239		sb.WriteString(")|(?:")
240		sb.WriteString(matcher)
241	}
242	sb.WriteString(")")
243
244	// Compile the whole thing as the isVendorRegExp
245	isVendorRegExp = regexp.MustCompile(sb.String())
246}
247