1package data
2
3import (
4	"bytes"
5	"strings"
6
7	"github.com/go-enry/go-enry/v2/regex"
8)
9
// GeneratedCodeExtensions is the set of file extensions (written with their
// leading dot) that are treated as belonging to generated files for sure.
// Only the keys carry information; the values are empty structs.
var GeneratedCodeExtensions = map[string]struct{}{
	// XCode files
	".nib":             {},
	".xcworkspacedata": {},
	".xcuserstate":     {},
}
18
// GeneratedCodeNameMatcher is a predicate over a file name: it returns true
// when the file with the given name is considered generated. The helpers
// nameMatches, nameContains and nameEndsWith build values of this type.
type GeneratedCodeNameMatcher func(string) bool
22
23func nameMatches(pattern string) GeneratedCodeNameMatcher {
24	r := regex.MustCompile(pattern)
25	return func(name string) bool {
26		return r.MatchString(name)
27	}
28}
29
30func nameContains(pattern string) GeneratedCodeNameMatcher {
31	return func(name string) bool {
32		return strings.Contains(name, pattern)
33	}
34}
35
36func nameEndsWith(pattern string) GeneratedCodeNameMatcher {
37	return func(name string) bool {
38		return strings.HasSuffix(name, pattern)
39	}
40}
41
// GeneratedCodeNameMatchers are all the matchers that check whether the code
// is generated based only on the file name. Each entry is built with
// nameMatches (regular expression), nameContains (substring) or nameEndsWith
// (suffix).
var GeneratedCodeNameMatchers = []GeneratedCodeNameMatcher{
	// CocoaPods: a "Pods/" directory at the start of the name or after a slash.
	nameMatches(`(^Pods|\/Pods)\/`),

	// Carthage build output: a "Carthage/Build/" directory.
	nameMatches(`(^|\/)Carthage\/Build\/`),

	// .NET designer file: ".designer.cs" or ".designer.vb", case-insensitive.
	nameMatches(`(?i)\.designer\.(cs|vb)$`),

	// Generated .NET SpecFlow feature file.
	nameEndsWith(".feature.cs"),

	// Node modules directory.
	nameContains("node_modules/"),

	// Go vendor directory: "vendor/" followed by a dotted host-like name
	// ending in one of the listed top-level domains.
	nameMatches(`vendor\/([-0-9A-Za-z]+\.)+(com|edu|gov|in|me|net|org|fm|io)`),

	// Go dependency lock files (dep and glide).
	nameEndsWith("Gopkg.lock"),
	nameEndsWith("glide.lock"),

	// Esy lock, optionally prefixed with "<word>.".
	nameMatches(`(^|\/)(\w+\.)?esy.lock$`),

	// NPM shrinkwrap.
	nameEndsWith("npm-shrinkwrap.json"),

	// NPM package lock.
	nameEndsWith("package-lock.json"),

	// Yarn Plug'n'Play loader: ".pnp.js", ".pnp.cjs" or ".pnp.mjs".
	nameMatches(`(^|\/)\.pnp\.(c|m)?js$`),

	// Godeps directory.
	nameContains("Godeps/"),

	// Composer lock.
	nameEndsWith("composer.lock"),

	// Generated by Zephir: "<name>.zep.c", ".zep.h" or ".zep.php".
	nameMatches(`.\.zep\.(?:c|h|php)$`),

	// Cargo lock.
	nameEndsWith("Cargo.lock"),

	// Pipenv lock.
	nameEndsWith("Pipfile.lock"),

	// GraphQL Relay output directory.
	nameContains("__generated__/"),
}
97
// GeneratedCodeMatcher checks whether the file with the given data is
// generated code. It receives the file path, its extension (the matchers in
// this file compare it against dotted, lowercase values such as ".js") and
// the raw file content, and returns true when the file looks generated.
type GeneratedCodeMatcher func(path, ext string, content []byte) bool
101
// GeneratedCodeMatchers is the list of all generated code matchers that
// rely on checking the content of the file to make the guess. Every entry
// has the GeneratedCodeMatcher signature and is defined in this file.
var GeneratedCodeMatchers = []GeneratedCodeMatcher{
	isMinifiedFile,
	hasSourceMapReference,
	isSourceMap,
	isCompiledCoffeeScript,
	isGeneratedNetDocfile,
	isGeneratedJavaScriptPEGParser,
	isGeneratedPostScript,
	isGeneratedGo,
	isGeneratedProtobuf,
	isGeneratedJavaScriptProtocolBuffer,
	isGeneratedApacheThrift,
	isGeneratedJNIHeader,
	isVCRCassette,
	isCompiledCythonFile,
	isGeneratedModule,
	isGeneratedUnity3DMeta,
	isGeneratedRacc,
	isGeneratedJFlex,
	isGeneratedGrammarKit,
	isGeneratedRoxygen2,
	isGeneratedJison,
	isGeneratedGRPCCpp,
	isGeneratedDart,
	isGeneratedPerlPPPortHeader,
	isGeneratedGameMakerStudio,
	isGeneratedGimp,
	isGeneratedVisualStudio6,
	isGeneratedHaxe,
	isGeneratedHTML,
	isGeneratedJooq,
}
136
// canBeMinified reports whether ext is one of the two extensions, ".js" or
// ".css", that the minification and source-map checks apply to. The
// comparison is exact and case-sensitive.
func canBeMinified(ext string) bool {
	switch ext {
	case ".js", ".css":
		return true
	default:
		return false
	}
}
140
141// isMinifiedFile returns whether the file may be minified.
142// We consider a minified file any css or js file whose average number of chars
143// per line is more than 110.
144func isMinifiedFile(path, ext string, content []byte) bool {
145	if !canBeMinified(ext) {
146		return false
147	}
148
149	var chars, lines uint64
150	forEachLine(content, func(line []byte) {
151		chars += uint64(len(line))
152		lines++
153	})
154
155	if lines == 0 {
156		return false
157	}
158
159	return chars/lines > 110
160}
161
// sourceMapRegex matches a line that either starts with "//" or "/*" followed
// by "#" or "@" and then " sourceURL" / " sourceMappingURL", or that contains
// "sourceURL=" at any position.
var sourceMapRegex = regex.MustCompile(`^\/[*\/][\#@] source(?:Mapping)?URL|sourceURL=`)
163
164// hasSourceMapReference returns whether the file contains a reference to a
165// source-map file.
166func hasSourceMapReference(_ string, ext string, content []byte) bool {
167	if !canBeMinified(ext) {
168		return false
169	}
170
171	for _, line := range getLines(content, -2) {
172		if sourceMapRegex.Match(line) {
173			return true
174		}
175	}
176
177	return false
178}
179
// sourceMapRegexps are the patterns isSourceMap tries against the first line
// of a file: a JSON object opening with a numeric "version" field, or a
// "/** Begin line maps. **/{" header.
var sourceMapRegexps = []regex.EnryRegexp{
	regex.MustCompile(`^{"version":\d+,`),
	regex.MustCompile(`^\/\*\* Begin line maps\. \*\*\/{`),
}
184
185// isSourceMap returns whether the file itself is a source map.
186func isSourceMap(path, _ string, content []byte) bool {
187	if strings.HasSuffix(path, ".js.map") || strings.HasSuffix(path, ".css.map") {
188		return true
189	}
190
191	firstLine := getFirstLine(content)
192	if len(firstLine) == 0 {
193		return false
194	}
195
196	for _, r := range sourceMapRegexps {
197		if r.Match(firstLine) {
198			return true
199		}
200	}
201
202	return false
203}
204
205func isCompiledCoffeeScript(path, ext string, content []byte) bool {
206	if ext != ".js" {
207		return false
208	}
209
210	firstLine := getFirstLine(content)
211	lastLines := getLines(content, -2)
212	if len(lastLines) < 2 {
213		return false
214	}
215
216	if string(firstLine) == "(function() {" &&
217		string(lastLines[1]) == "}).call(this);" &&
218		string(lastLines[0]) == "" {
219		score := 0
220
221		forEachLine(content, func(line []byte) {
222			if bytes.Contains(line, []byte("var ")) {
223				// Underscored temp vars are likely to be Coffee
224				score += 1 * countAppearancesInLine(line, "_fn", "_i", "_len", "_ref", "_results")
225
226				// bind and extend functions are very Coffee specific
227				score += 3 * countAppearancesInLine(line, "__bind", "__extends", "__hasProp", "__indexOf", "__slice")
228			}
229		})
230
231		// Require a score of 3. This is fairly abritrary. Consider tweaking later.
232		// See: https://github.com/github/linguist/blob/master/lib/linguist/generated.rb#L176-L213
233		return score >= 3
234	}
235
236	return false
237}
238
// isGeneratedNetDocfile reports whether an ".xml" file looks like a generated
// .NET documentation file. The content is split on '\n'; it must produce more
// than three pieces, with "<doc>" in the second piece, "<assembly>" in the
// third, and "</doc>" in the second-to-last.
func isGeneratedNetDocfile(_, ext string, content []byte) bool {
	if ext != ".xml" {
		return false
	}

	rows := bytes.Split(content, []byte("\n"))
	total := len(rows)
	if total < 4 {
		return false
	}

	switch {
	case !bytes.Contains(rows[1], []byte("<doc>")):
		return false
	case !bytes.Contains(rows[2], []byte("<assembly>")):
		return false
	}

	return bytes.Contains(rows[total-2], []byte("</doc>"))
}
253
// pegJavaScriptGeneratedRegex is intended to match input in which a "/*"
// block comment contains "Generated by PEG" followed by any character and
// "js" before that comment is closed. isGeneratedJavaScriptPEGParser applies
// it to the first five lines joined together.
var pegJavaScriptGeneratedRegex = regex.MustCompile(`^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js`)
255
256func isGeneratedJavaScriptPEGParser(_, ext string, content []byte) bool {
257	if ext != ".js" {
258		return false
259	}
260
261	// PEG.js-generated parsers include a comment near the top  of the file
262	// that marks them as such.
263	return pegJavaScriptGeneratedRegex.Match(bytes.Join(getLines(content, 5), []byte("")))
264}
265
// postScriptType1And42Regex matches, after a line break and optional
// whitespace, either "currentfile eexec" followed by whitespace or "/sfnts"
// followed by whitespace and "[".
var postScriptType1And42Regex = regex.MustCompile(`(\n|\r\n|\r)\s*(?:currentfile eexec\s+|\/sfnts\s+\[)`)

// postScriptRegexes are tried by isGeneratedPostScript against the
// "%%Creator: " line: the first matches any digit or one of several tool
// names, the second matches further tool names that carry no version number.
var postScriptRegexes = []regex.EnryRegexp{
	regex.MustCompile(`[0-9]|draw|mpage|ImageMagick|inkscape|MATLAB`),
	regex.MustCompile(`PCBNEW|pnmtops|\(Unknown\)|Serif Affinity|Filterimage -tops`),
}
272
273func isGeneratedPostScript(_, ext string, content []byte) bool {
274	if ext != ".ps" && ext != ".eps" && ext != ".pfa" {
275		return false
276	}
277
278	// Type 1 and Type 42 fonts converted to PostScript are stored as hex-encoded byte streams; these
279	// streams are always preceded the `eexec` operator (if Type 1), or the `/sfnts` key (if Type 42).
280	if postScriptType1And42Regex.Match(content) {
281		return true
282	}
283
284	// We analyze the "%%Creator:" comment, which contains the author/generator
285	// of the file. If there is one, it should be in one of the first few lines.
286	var creator []byte
287	for _, line := range getLines(content, 10) {
288		if bytes.HasPrefix(line, []byte("%%Creator: ")) {
289			creator = line
290			break
291		}
292	}
293
294	if len(creator) == 0 {
295		return false
296	}
297
298	// EAGLE doesn't include a version number when it generates PostScript.
299	// However, it does prepend its name to the document's "%%Title" field.
300	if bytes.Contains(creator, []byte("EAGLE")) {
301		for _, line := range getLines(content, 5) {
302			if bytes.HasPrefix(line, []byte("%%Title: EAGLE Drawing ")) {
303				return true
304			}
305		}
306	}
307
308	// Most generators write their version number, while human authors' or companies'
309	// names don't contain numbers. So look if the line contains digits. Also
310	// look for some special cases without version numbers.
311	for _, r := range postScriptRegexes {
312		if r.Match(creator) {
313			return true
314		}
315	}
316
317	return false
318}
319
320func isGeneratedGo(_, ext string, content []byte) bool {
321	if ext != ".go" {
322		return false
323	}
324
325	lines := getLines(content, 40)
326	if len(lines) <= 1 {
327		return false
328	}
329
330	for _, line := range lines {
331		if bytes.Contains(line, []byte("Code generated by")) {
332			return true
333		}
334	}
335
336	return false
337}
338
// protoExtensions is the set of extensions isGeneratedProtobuf is willing to
// inspect; files with any other extension are rejected without reading the
// content.
var protoExtensions = map[string]struct{}{
	".py":   {},
	".java": {},
	".h":    {},
	".cc":   {},
	".cpp":  {},
	".m":    {},
	".rb":   {},
	".php":  {},
}
349
350func isGeneratedProtobuf(_, ext string, content []byte) bool {
351	if _, ok := protoExtensions[ext]; !ok {
352		return false
353	}
354
355	lines := getLines(content, 3)
356	if len(lines) <= 1 {
357		return false
358	}
359
360	for _, line := range lines {
361		if bytes.Contains(line, []byte("Generated by the protocol buffer compiler.  DO NOT EDIT!")) {
362			return true
363		}
364	}
365
366	return false
367}
368
369func isGeneratedJavaScriptProtocolBuffer(_, ext string, content []byte) bool {
370	if ext != ".js" {
371		return false
372	}
373
374	lines := getLines(content, 6)
375	if len(lines) < 6 {
376		return false
377	}
378
379	return bytes.Contains(lines[5], []byte("GENERATED CODE -- DO NOT EDIT!"))
380}
381
// apacheThriftExtensions is the set of extensions isGeneratedApacheThrift is
// willing to inspect; files with any other extension are rejected without
// reading the content.
var apacheThriftExtensions = map[string]struct{}{
	".rb":   {},
	".py":   {},
	".go":   {},
	".js":   {},
	".m":    {},
	".java": {},
	".h":    {},
	".cc":   {},
	".cpp":  {},
	".php":  {},
}
394
395func isGeneratedApacheThrift(_, ext string, content []byte) bool {
396	if _, ok := apacheThriftExtensions[ext]; !ok {
397		return false
398	}
399
400	for _, line := range getLines(content, 6) {
401		if bytes.Contains(line, []byte("Autogenerated by Thrift Compiler")) {
402			return true
403		}
404	}
405
406	return false
407}
408
409func isGeneratedJNIHeader(_, ext string, content []byte) bool {
410	if ext != ".h" {
411		return false
412	}
413
414	lines := getLines(content, 2)
415	if len(lines) < 2 {
416		return false
417	}
418
419	return bytes.Contains(lines[0], []byte("/* DO NOT EDIT THIS FILE - it is machine generated */")) &&
420		bytes.Contains(lines[1], []byte("#include <jni.h>"))
421}
422
423func isVCRCassette(_, ext string, content []byte) bool {
424	if ext != ".yml" {
425		return false
426	}
427
428	lines := getLines(content, -2)
429	if len(lines) < 2 {
430		return false
431	}
432
433	return bytes.Contains(lines[1], []byte("recorded_with: VCR"))
434}
435
436func isCompiledCythonFile(_, ext string, content []byte) bool {
437	if ext != ".c" && ext != ".cpp" {
438		return false
439	}
440
441	lines := getLines(content, 1)
442	if len(lines) < 1 {
443		return false
444	}
445
446	return bytes.Contains(lines[0], []byte("Generated by Cython"))
447}
448
449func isGeneratedModule(_, ext string, content []byte) bool {
450	if ext != ".mod" {
451		return false
452	}
453
454	lines := getLines(content, 1)
455	if len(lines) < 1 {
456		return false
457	}
458
459	return bytes.Contains(lines[0], []byte("PCBNEW-LibModule-V")) ||
460		bytes.Contains(lines[0], []byte("GFORTRAN module version '"))
461}
462
463func isGeneratedUnity3DMeta(_, ext string, content []byte) bool {
464	if ext != ".meta" {
465		return false
466	}
467
468	lines := getLines(content, 1)
469	if len(lines) < 1 {
470		return false
471	}
472
473	return bytes.Contains(lines[0], []byte("fileFormatVersion: "))
474}
475
476func isGeneratedRacc(_, ext string, content []byte) bool {
477	if ext != ".rb" {
478		return false
479	}
480
481	lines := getLines(content, 3)
482	if len(lines) < 3 {
483		return false
484	}
485
486	return bytes.HasPrefix(lines[2], []byte("# This file is automatically generated by Racc"))
487}
488
489func isGeneratedJFlex(_, ext string, content []byte) bool {
490	if ext != ".java" {
491		return false
492	}
493
494	lines := getLines(content, 1)
495	if len(lines) < 1 {
496		return false
497	}
498
499	return bytes.HasPrefix(lines[0], []byte("/* The following code was generated by JFlex "))
500}
501
502func isGeneratedGrammarKit(_, ext string, content []byte) bool {
503	if ext != ".java" {
504		return false
505	}
506
507	lines := getLines(content, 1)
508	if len(lines) < 1 {
509		return false
510	}
511
512	return bytes.Contains(lines[0], []byte("// This is a generated file. Not intended for manual editing."))
513}
514
515func isGeneratedRoxygen2(_, ext string, content []byte) bool {
516	if ext != ".rd" {
517		return false
518	}
519
520	lines := getLines(content, 1)
521	if len(lines) < 1 {
522		return false
523	}
524
525	return bytes.Contains(lines[0], []byte("% Generated by roxygen2: do not edit by hand"))
526}
527
528func isGeneratedJison(_, ext string, content []byte) bool {
529	if ext != ".js" {
530		return false
531	}
532
533	lines := getLines(content, 1)
534	if len(lines) < 1 {
535		return false
536	}
537
538	return bytes.Contains(lines[0], []byte("/* parser generated by jison ")) ||
539		bytes.Contains(lines[0], []byte("/* generated by jison-lex "))
540}
541
542func isGeneratedGRPCCpp(_, ext string, content []byte) bool {
543	switch ext {
544	case ".cpp", ".hpp", ".h", ".cc":
545		lines := getLines(content, 1)
546		if len(lines) < 1 {
547			return false
548		}
549
550		return bytes.Contains(lines[0], []byte("// Generated by the gRPC"))
551	default:
552		return false
553	}
554}
555
// dartRegex matches "generated code" and "do not modify" separated by two or
// three non-word characters. isGeneratedDart applies it to the lowercased
// first line of the file.
var dartRegex = regex.MustCompile(`generated code\W{2,3}do not modify`)
557
558func isGeneratedDart(_, ext string, content []byte) bool {
559	if ext != ".dart" {
560		return false
561	}
562
563	lines := getLines(content, 1)
564	if len(lines) < 1 {
565		return false
566	}
567
568	return dartRegex.Match(bytes.ToLower(lines[0]))
569}
570
571func isGeneratedPerlPPPortHeader(name, _ string, content []byte) bool {
572	if !strings.HasSuffix(name, "ppport.h") {
573		return false
574	}
575
576	lines := getLines(content, 10)
577	if len(lines) < 10 {
578		return false
579	}
580
581	return bytes.Contains(lines[8], []byte("Automatically created by Devel::PPPort"))
582}
583
// Patterns used by isGeneratedGameMakerStudio.
var (
	// Applied to the first line: it must start with three dot-separated
	// single digits and later contain "|{".
	gameMakerStudioFirstLineRegex = regex.MustCompile(`^\d\.\d\.\d.+\|\{`)
	// Applied to the third line: a "modelName" key whose value starts with "GM".
	gameMakerStudioThirdLineRegex = regex.MustCompile(`\"modelName\"\:\s*\"GM`)
)
588
589func isGeneratedGameMakerStudio(_, ext string, content []byte) bool {
590	if ext != ".yy" && ext != ".yyp" {
591		return false
592	}
593
594	lines := getLines(content, 3)
595	if len(lines) < 3 {
596		return false
597	}
598
599	return gameMakerStudioThirdLineRegex.Match(lines[2]) ||
600		gameMakerStudioFirstLineRegex.Match(lines[0])
601}
602
// gimpRegexes match the header comments isGeneratedGimp looks for on the
// first line: a "GIMP ... C-Source image dump (<name>.c)" comment or a
// "GIMP header image file format (...): <name>.h" comment.
var gimpRegexes = []regex.EnryRegexp{
	regex.MustCompile(`\/\* GIMP [a-zA-Z0-9\- ]+ C\-Source image dump \(.+?\.c\) \*\/`),
	regex.MustCompile(`\/\*  GIMP header image file format \([a-zA-Z0-9\- ]+\)\: .+?\.h  \*\/`),
}
607
608func isGeneratedGimp(_, ext string, content []byte) bool {
609	if ext != ".c" && ext != ".h" {
610		return false
611	}
612
613	lines := getLines(content, 1)
614	if len(lines) < 1 {
615		return false
616	}
617
618	for _, r := range gimpRegexes {
619		if r.Match(lines[0]) {
620			return true
621		}
622	}
623
624	return false
625}
626
627func isGeneratedVisualStudio6(_, ext string, content []byte) bool {
628	if ext != ".dsp" {
629		return false
630	}
631
632	for _, l := range getLines(content, 3) {
633		if bytes.Contains(l, []byte("# Microsoft Developer Studio Generated Build File")) {
634			return true
635		}
636	}
637
638	return false
639}
640
// haxeExtensions is the set of extensions isGeneratedHaxe is willing to
// inspect; files with any other extension are rejected without reading the
// content.
var haxeExtensions = map[string]struct{}{
	".js":   {},
	".py":   {},
	".lua":  {},
	".cpp":  {},
	".h":    {},
	".java": {},
	".cs":   {},
	".php":  {},
}
651
652func isGeneratedHaxe(_, ext string, content []byte) bool {
653	if _, ok := haxeExtensions[ext]; !ok {
654		return false
655	}
656
657	for _, l := range getLines(content, 3) {
658		if bytes.Contains(l, []byte("Generated by Haxe")) {
659			return true
660		}
661	}
662
663	return false
664}
665
// Patterns used by isGeneratedHTML.
var (
	// An HTML comment of the form "<!-- Generated by Doxygen <version> -->".
	doxygenRegex         = regex.MustCompile(`<!--\s+Generated by Doxygen\s+[.0-9]+\s*-->`)
	// A whole "<meta ...>" tag with at least one attribute.
	htmlMetaRegex        = regex.MustCompile(`<meta(\s+[^>]+)>`)
	// A name, content or value attribute; group 1 is the attribute name and
	// group 2 its value, which may be double-quoted, single-quoted or bare.
	htmlMetaContentRegex = regex.MustCompile(`\s+(name|content|value)\s*=\s*("[^"]+"|'[^']+'|[^\s"']+)`)
	// "org" and "mode" separated by whitespace, tested against the generator value.
	orgModeMetaRegex     = regex.MustCompile(`org\s+mode`)
)
672
673func isGeneratedHTML(_, ext string, content []byte) bool {
674	if ext != ".html" && ext != ".htm" && ext != ".xhtml" {
675		return false
676	}
677
678	lines := getLines(content, 30)
679
680	// Pkgdown
681	if len(lines) >= 2 {
682		for _, l := range lines[:2] {
683			if bytes.Contains(l, []byte("<!-- Generated by pkgdown: do not edit by hand -->")) {
684				return true
685			}
686		}
687	}
688
689	// Mandoc
690	if len(lines) > 2 &&
691		bytes.HasPrefix(lines[2], []byte("<!-- This is an automatically generated file.")) {
692		return true
693	}
694
695	// Doxygen
696	for _, l := range lines {
697		if doxygenRegex.Match(l) {
698			return true
699		}
700	}
701
702	// HTML tag: <meta name="generator" content="" />
703	part := bytes.ToLower(bytes.Join(lines, []byte{' '}))
704	part = bytes.ReplaceAll(part, []byte{'\n'}, []byte{})
705	part = bytes.ReplaceAll(part, []byte{'\r'}, []byte{})
706	matches := htmlMetaRegex.FindAll(part, -1)
707	if len(matches) == 0 {
708		return false
709	}
710
711	for _, m := range matches {
712		var name, value, content string
713		ms := htmlMetaContentRegex.FindAllStringSubmatch(string(m), -1)
714		for _, m := range ms {
715			switch m[1] {
716			case "name":
717				name = m[2]
718			case "value":
719				value = m[2]
720			case "content":
721				content = m[2]
722			}
723		}
724
725		var val = value
726		if val == "" {
727			val = content
728		}
729
730		name = strings.Trim(name, `"'`)
731		val = strings.Trim(val, `"'`)
732
733		if name != "generator" || val == "" {
734			continue
735		}
736
737		if strings.Contains(val, "jlatex2html") ||
738			strings.Contains(val, "latex2html") ||
739			strings.Contains(val, "groff") ||
740			strings.Contains(val, "makeinfo") ||
741			strings.Contains(val, "texi2html") ||
742			strings.Contains(val, "ronn") ||
743			orgModeMetaRegex.MatchString(val) {
744			return true
745		}
746	}
747
748	return false
749}
750
751func isGeneratedJooq(_, ext string, content []byte) bool {
752	if ext != ".java" {
753		return false
754	}
755
756	for _, l := range getLines(content, 2) {
757		if bytes.Contains(l, []byte("This file is generated by jOOQ.")) {
758			return true
759		}
760	}
761
762	return false
763}
764
765func getFirstLine(content []byte) []byte {
766	lines := getLines(content, 1)
767	if len(lines) > 0 {
768		return lines[0]
769	}
770	return nil
771}
772
// getLines returns up to the first n lines. A negative index will return up to
// the last n lines in reverse order. Lines are separated by '\n' and returned
// without it; the returned slices alias content. The result is nil when no
// line is collected.
//
// When reading from the end, a line is kept only if it starts before the final
// byte of content. This drops the empty line after a trailing newline, and
// also a final line that is a single byte long.
// NOTE(review): dropping a one-byte final line looks unintended — confirm
// before relying on it.
func getLines(content []byte, n int) [][]byte {
	var out [][]byte

	if n < 0 {
		want := -n
		lastByte := len(content) - 1
		end := len(content)
		for end > 0 && len(out) < want {
			// start is the index right after the previous newline, or 0.
			start := bytes.LastIndexByte(content[:end], '\n') + 1
			if start < lastByte {
				out = append(out, content[start:end])
			}
			// Continue the search just before that newline.
			end = start - 1
		}

		return out
	}

	for start := 0; start < len(content) && len(out) < n; {
		// Without a further newline the line runs to the end of content.
		end := len(content)
		if off := bytes.IndexByte(content[start:], '\n'); off >= 0 {
			end = start + off
		}

		out = append(out, content[start:end])
		start = end + 1
	}

	return out
}
801
// forEachLine calls cb once for every '\n'-separated line of content, in
// order, passing the line without its newline. The slices passed to cb alias
// content. No call is made for empty content, and a trailing newline does not
// produce an extra empty line.
func forEachLine(content []byte, cb func([]byte)) {
	for start := 0; start < len(content); {
		// Without a further newline the line runs to the end of content.
		end := len(content)
		if off := bytes.IndexByte(content[start:], '\n'); off >= 0 {
			end = start + off
		}

		cb(content[start:end])
		start = end + 1
	}
}
816
// countAppearancesInLine returns the sum, over all targets, of the number of
// non-overlapping occurrences of each target in line as counted by
// bytes.Count. With no targets the result is 0.
func countAppearancesInLine(line []byte, targets ...string) int {
	total := 0
	for i := range targets {
		total += bytes.Count(line, []byte(targets[i]))
	}

	return total
}
824