1package enry 2 3import ( 4 "bytes" 5 "path/filepath" 6 "regexp" 7 "sort" 8 "strings" 9 10 "github.com/go-enry/go-enry/v2/data" 11 "github.com/go-enry/go-enry/v2/regex" 12) 13 14const binSniffLen = 8000 15 16var configurationLanguages = map[string]struct{}{ 17 "XML": {}, 18 "JSON": {}, 19 "TOML": {}, 20 "YAML": {}, 21 "INI": {}, 22 "SQL": {}, 23} 24 25// IsConfiguration tells if filename is in one of the configuration languages. 26func IsConfiguration(path string) bool { 27 language, _ := GetLanguageByExtension(path) 28 _, is := configurationLanguages[language] 29 return is 30} 31 32// IsImage tells if a given file is an image (PNG, JPEG or GIF format). 33func IsImage(path string) bool { 34 extension := filepath.Ext(path) 35 if extension == ".png" || extension == ".jpg" || extension == ".jpeg" || extension == ".gif" { 36 return true 37 } 38 39 return false 40} 41 42// GetMIMEType returns a MIME type of a given file based on its languages. 43func GetMIMEType(path string, language string) string { 44 if mime, ok := data.LanguagesMime[language]; ok { 45 return mime 46 } 47 48 if IsImage(path) { 49 return "image/" + filepath.Ext(path)[1:] 50 } 51 52 return "text/plain" 53} 54 55// IsDocumentation returns whether or not path is a documentation path. 56func IsDocumentation(path string) bool { 57 return matchRegexSlice(data.DocumentationMatchers, path) 58} 59 60// IsDotFile returns whether or not path has dot as a prefix. 61func IsDotFile(path string) bool { 62 base := filepath.Base(filepath.Clean(path)) 63 return strings.HasPrefix(base, ".") && base != "." 64} 65 66var isVendorRegExp *regexp.Regexp 67 68// IsVendor returns whether or not path is a vendor path. 69func IsVendor(path string) bool { 70 return isVendorRegExp.MatchString(path) 71} 72 73// IsTest returns whether or not path is a test path. 74func IsTest(path string) bool { 75 return matchRegexSlice(data.TestMatchers, path) 76} 77 78// IsBinary detects if data is a binary value based on: 79// http://git.kernel.org/cgit/git/git.git/tree/xdiff-interface.c?id=HEAD#n198 80func IsBinary(data []byte) bool { 81 if len(data) > binSniffLen { 82 data = data[:binSniffLen] 83 } 84 85 if bytes.IndexByte(data, byte(0)) == -1 { 86 return false 87 } 88 89 return true 90} 91 92// GetColor returns a HTML color code of a given language. 93func GetColor(language string) string { 94 if color, ok := data.LanguagesColor[language]; ok { 95 return color 96 } 97 98 if color, ok := data.LanguagesColor[GetLanguageGroup(language)]; ok { 99 return color 100 } 101 102 return "#cccccc" 103} 104 105func matchRegexSlice(exprs []regex.EnryRegexp, str string) bool { 106 for _, expr := range exprs { 107 if expr.MatchString(str) { 108 return true 109 } 110 } 111 112 return false 113} 114 115// IsGenerated returns whether the file with the given path and content is a 116// generated file. 117func IsGenerated(path string, content []byte) bool { 118 ext := strings.ToLower(filepath.Ext(path)) 119 if _, ok := data.GeneratedCodeExtensions[ext]; ok { 120 return true 121 } 122 123 for _, m := range data.GeneratedCodeNameMatchers { 124 if m(path) { 125 return true 126 } 127 } 128 129 path = strings.ToLower(path) 130 for _, m := range data.GeneratedCodeMatchers { 131 if m(path, ext, content) { 132 return true 133 } 134 } 135 136 return false 137} 138 139func init() { 140 // We now collate the individual regexps that make up the VendorMatchers to 141 // produce a single large regexp which is around twice as fast to test than 142 // simply iterating through all the regexps or naïvely collating the 143 // regexps. 144 // 145 // --- 146 // 147 // data.VendorMatchers here is a slice containing individual regexps that 148 // match a vendor file therefore if we want to test if a filename is a 149 // Vendor we need to test whether that filename matches one or more of 150 // those regexps. 151 // 152 // Now we could test each matcher in turn using a shortcircuiting test i.e. 153 // 154 // func IsVendor(filename string) bool { 155 // for _, matcher := range data.VendorMatchers { 156 // if matcher.Match(filename) { 157 // return true 158 // } 159 // } 160 // return false 161 // } 162 // 163 // Or concatentate all these regexps using groups i.e. 164 // 165 // `(regexp1)|(regexp2)|(regexp3)|...` 166 // 167 // However both of these are relatively slow and they don't take advantage 168 // of the inherent structure within our regexps... 169 // 170 // If we look at our regexps there are essentially three types of regexp: 171 // 172 // 1. Those that start with `^` 173 // 2. Those that start with `(^|/)` 174 // 3. Others 175 // 176 // If we collate our regexps into these groups that will significantly 177 // reduce the likelihood of backtracking within the regexp trie matcher. 178 // 179 // A further improvement is to use non-capturing groups as otherwise the 180 // regexp parser, whilst matching, will have to allocate slices for 181 // matching positions. (A future improvement here could be in the use of 182 // enforcing non-capturing groups within the sub-regexps too.) 183 // 184 // Finally if we sort the segments we can help the matcher build a more 185 // efficient matcher and trie. 186 187 // alias the VendorMatchers to simplify things 188 matchers := data.VendorMatchers 189 190 // Create three temporary string slices for our three groups above - prefixes removed 191 caretStrings := make([]string, 0, 10) 192 caretSegmentStrings := make([]string, 0, 10) 193 matcherStrings := make([]string, 0, len(matchers)) 194 195 // Walk the matchers and check their string representation for each group prefix, remove it and add to the respective group slices 196 for _, matcher := range matchers { 197 str := matcher.String() 198 if str[0] == '^' { 199 caretStrings = append(caretStrings, str[1:]) 200 } else if str[0:5] == "(^|/)" { 201 caretSegmentStrings = append(caretSegmentStrings, str[5:]) 202 } else { 203 matcherStrings = append(matcherStrings, str) 204 } 205 } 206 207 // Sort the strings within each group - a potential further improvement could be in simplifying within these groups 208 sort.Strings(caretSegmentStrings) 209 sort.Strings(caretStrings) 210 sort.Strings(matcherStrings) 211 212 // Now build the collated regexp 213 sb := &strings.Builder{} 214 215 // Start with group 1 - those that started with `^` 216 sb.WriteString("(?:^(?:") 217 sb.WriteString(caretStrings[0]) 218 for _, matcher := range caretStrings[1:] { 219 sb.WriteString(")|(?:") 220 sb.WriteString(matcher) 221 } 222 sb.WriteString("))") 223 sb.WriteString("|") 224 225 // Now add group 2 - those that started with `(^|/)` 226 sb.WriteString("(?:(?:^|/)(?:") 227 sb.WriteString(caretSegmentStrings[0]) 228 for _, matcher := range caretSegmentStrings[1:] { 229 sb.WriteString(")|(?:") 230 sb.WriteString(matcher) 231 } 232 sb.WriteString("))") 233 sb.WriteString("|") 234 235 // Finally add the rest 236 sb.WriteString("(?:") 237 sb.WriteString(matcherStrings[0]) 238 for _, matcher := range matcherStrings[1:] { 239 sb.WriteString(")|(?:") 240 sb.WriteString(matcher) 241 } 242 sb.WriteString(")") 243 244 // Compile the whole thing as the isVendorRegExp 245 isVendorRegExp = regexp.MustCompile(sb.String()) 246} 247