1package data 2 3import ( 4 "bytes" 5 "strings" 6 7 "github.com/go-enry/go-enry/v2/regex" 8) 9 10// GeneratedCodeExtensions contains all extensions that belong to generated 11// files for sure. 12var GeneratedCodeExtensions = map[string]struct{}{ 13 // XCode files 14 ".nib": {}, 15 ".xcworkspacedata": {}, 16 ".xcuserstate": {}, 17} 18 19// GeneratedCodeNameMatcher is a function that tells whether the file with the 20// given name is generated. 21type GeneratedCodeNameMatcher func(string) bool 22 23func nameMatches(pattern string) GeneratedCodeNameMatcher { 24 r := regex.MustCompile(pattern) 25 return func(name string) bool { 26 return r.MatchString(name) 27 } 28} 29 30func nameContains(pattern string) GeneratedCodeNameMatcher { 31 return func(name string) bool { 32 return strings.Contains(name, pattern) 33 } 34} 35 36func nameEndsWith(pattern string) GeneratedCodeNameMatcher { 37 return func(name string) bool { 38 return strings.HasSuffix(name, pattern) 39 } 40} 41 42// GeneratedCodeNameMatchers are all the matchers that check whether the code 43// is generated based only on the file name. 
var GeneratedCodeNameMatchers = []GeneratedCodeNameMatcher{
	// CocoaPods: a "Pods/" directory at the start of the path or below it.
	nameMatches(`(^Pods|\/Pods)\/`),

	// Carthage build output directory.
	nameMatches(`(^|\/)Carthage\/Build\/`),

	// .NET designer file (case-insensitive match).
	nameMatches(`(?i)\.designer\.(cs|vb)$`),

	// Generated .NET SpecFlow feature file.
	nameEndsWith(".feature.cs"),

	// Node modules.
	nameContains("node_modules/"),

	// Go vendor directory containing a host-like path segment.
	nameMatches(`vendor\/([-0-9A-Za-z]+\.)+(com|edu|gov|in|me|net|org|fm|io)`),

	// Go dependency lock files (dep and glide).
	nameEndsWith("Gopkg.lock"),
	nameEndsWith("glide.lock"),

	// Esy lock.
	nameMatches(`(^|\/)(\w+\.)?esy.lock$`),

	// NPM shrinkwrap.
	nameEndsWith("npm-shrinkwrap.json"),

	// NPM package lock.
	nameEndsWith("package-lock.json"),

	// Yarn Plug'n'Play loader (.pnp.js, .pnp.cjs, .pnp.mjs).
	nameMatches(`(^|\/)\.pnp\.(c|m)?js$`),

	// Godeps.
	nameContains("Godeps/"),

	// Composer lock.
	nameEndsWith("composer.lock"),

	// Generated by Zephir.
	nameMatches(`.\.zep\.(?:c|h|php)$`),

	// Cargo lock.
	nameEndsWith("Cargo.lock"),

	// Pipenv lock.
	nameEndsWith("Pipfile.lock"),

	// GraphQL Relay.
	nameContains("__generated__/"),
}

// GeneratedCodeMatcher checks whether the file with the given data is
// generated code. It receives the file path, its extension and its content.
type GeneratedCodeMatcher func(path, ext string, content []byte) bool

// GeneratedCodeMatchers is the list of all generated code matchers that
// rely on checking the content of the file to make the guess.
104var GeneratedCodeMatchers = []GeneratedCodeMatcher{ 105 isMinifiedFile, 106 hasSourceMapReference, 107 isSourceMap, 108 isCompiledCoffeeScript, 109 isGeneratedNetDocfile, 110 isGeneratedJavaScriptPEGParser, 111 isGeneratedPostScript, 112 isGeneratedGo, 113 isGeneratedProtobuf, 114 isGeneratedJavaScriptProtocolBuffer, 115 isGeneratedApacheThrift, 116 isGeneratedJNIHeader, 117 isVCRCassette, 118 isCompiledCythonFile, 119 isGeneratedModule, 120 isGeneratedUnity3DMeta, 121 isGeneratedRacc, 122 isGeneratedJFlex, 123 isGeneratedGrammarKit, 124 isGeneratedRoxygen2, 125 isGeneratedJison, 126 isGeneratedGRPCCpp, 127 isGeneratedDart, 128 isGeneratedPerlPPPortHeader, 129 isGeneratedGameMakerStudio, 130 isGeneratedGimp, 131 isGeneratedVisualStudio6, 132 isGeneratedHaxe, 133 isGeneratedHTML, 134 isGeneratedJooq, 135} 136 137func canBeMinified(ext string) bool { 138 return ext == ".js" || ext == ".css" 139} 140 141// isMinifiedFile returns whether the file may be minified. 142// We consider a minified file any css or js file whose average number of chars 143// per line is more than 110. 144func isMinifiedFile(path, ext string, content []byte) bool { 145 if !canBeMinified(ext) { 146 return false 147 } 148 149 var chars, lines uint64 150 forEachLine(content, func(line []byte) { 151 chars += uint64(len(line)) 152 lines++ 153 }) 154 155 if lines == 0 { 156 return false 157 } 158 159 return chars/lines > 110 160} 161 162var sourceMapRegex = regex.MustCompile(`^\/[*\/][\#@] source(?:Mapping)?URL|sourceURL=`) 163 164// hasSourceMapReference returns whether the file contains a reference to a 165// source-map file. 
166func hasSourceMapReference(_ string, ext string, content []byte) bool { 167 if !canBeMinified(ext) { 168 return false 169 } 170 171 for _, line := range getLines(content, -2) { 172 if sourceMapRegex.Match(line) { 173 return true 174 } 175 } 176 177 return false 178} 179 180var sourceMapRegexps = []regex.EnryRegexp{ 181 regex.MustCompile(`^{"version":\d+,`), 182 regex.MustCompile(`^\/\*\* Begin line maps\. \*\*\/{`), 183} 184 185// isSourceMap returns whether the file itself is a source map. 186func isSourceMap(path, _ string, content []byte) bool { 187 if strings.HasSuffix(path, ".js.map") || strings.HasSuffix(path, ".css.map") { 188 return true 189 } 190 191 firstLine := getFirstLine(content) 192 if len(firstLine) == 0 { 193 return false 194 } 195 196 for _, r := range sourceMapRegexps { 197 if r.Match(firstLine) { 198 return true 199 } 200 } 201 202 return false 203} 204 205func isCompiledCoffeeScript(path, ext string, content []byte) bool { 206 if ext != ".js" { 207 return false 208 } 209 210 firstLine := getFirstLine(content) 211 lastLines := getLines(content, -2) 212 if len(lastLines) < 2 { 213 return false 214 } 215 216 if string(firstLine) == "(function() {" && 217 string(lastLines[1]) == "}).call(this);" && 218 string(lastLines[0]) == "" { 219 score := 0 220 221 forEachLine(content, func(line []byte) { 222 if bytes.Contains(line, []byte("var ")) { 223 // Underscored temp vars are likely to be Coffee 224 score += 1 * countAppearancesInLine(line, "_fn", "_i", "_len", "_ref", "_results") 225 226 // bind and extend functions are very Coffee specific 227 score += 3 * countAppearancesInLine(line, "__bind", "__extends", "__hasProp", "__indexOf", "__slice") 228 } 229 }) 230 231 // Require a score of 3. This is fairly abritrary. Consider tweaking later. 
232 // See: https://github.com/github/linguist/blob/master/lib/linguist/generated.rb#L176-L213 233 return score >= 3 234 } 235 236 return false 237} 238 239func isGeneratedNetDocfile(_, ext string, content []byte) bool { 240 if ext != ".xml" { 241 return false 242 } 243 244 lines := bytes.Split(content, []byte{'\n'}) 245 if len(lines) <= 3 { 246 return false 247 } 248 249 return bytes.Contains(lines[1], []byte("<doc>")) && 250 bytes.Contains(lines[2], []byte("<assembly>")) && 251 bytes.Contains(lines[len(lines)-2], []byte("</doc>")) 252} 253 254var pegJavaScriptGeneratedRegex = regex.MustCompile(`^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js`) 255 256func isGeneratedJavaScriptPEGParser(_, ext string, content []byte) bool { 257 if ext != ".js" { 258 return false 259 } 260 261 // PEG.js-generated parsers include a comment near the top of the file 262 // that marks them as such. 263 return pegJavaScriptGeneratedRegex.Match(bytes.Join(getLines(content, 5), []byte(""))) 264} 265 266var postScriptType1And42Regex = regex.MustCompile(`(\n|\r\n|\r)\s*(?:currentfile eexec\s+|\/sfnts\s+\[)`) 267 268var postScriptRegexes = []regex.EnryRegexp{ 269 regex.MustCompile(`[0-9]|draw|mpage|ImageMagick|inkscape|MATLAB`), 270 regex.MustCompile(`PCBNEW|pnmtops|\(Unknown\)|Serif Affinity|Filterimage -tops`), 271} 272 273func isGeneratedPostScript(_, ext string, content []byte) bool { 274 if ext != ".ps" && ext != ".eps" && ext != ".pfa" { 275 return false 276 } 277 278 // Type 1 and Type 42 fonts converted to PostScript are stored as hex-encoded byte streams; these 279 // streams are always preceded the `eexec` operator (if Type 1), or the `/sfnts` key (if Type 42). 280 if postScriptType1And42Regex.Match(content) { 281 return true 282 } 283 284 // We analyze the "%%Creator:" comment, which contains the author/generator 285 // of the file. If there is one, it should be in one of the first few lines. 
286 var creator []byte 287 for _, line := range getLines(content, 10) { 288 if bytes.HasPrefix(line, []byte("%%Creator: ")) { 289 creator = line 290 break 291 } 292 } 293 294 if len(creator) == 0 { 295 return false 296 } 297 298 // EAGLE doesn't include a version number when it generates PostScript. 299 // However, it does prepend its name to the document's "%%Title" field. 300 if bytes.Contains(creator, []byte("EAGLE")) { 301 for _, line := range getLines(content, 5) { 302 if bytes.HasPrefix(line, []byte("%%Title: EAGLE Drawing ")) { 303 return true 304 } 305 } 306 } 307 308 // Most generators write their version number, while human authors' or companies' 309 // names don't contain numbers. So look if the line contains digits. Also 310 // look for some special cases without version numbers. 311 for _, r := range postScriptRegexes { 312 if r.Match(creator) { 313 return true 314 } 315 } 316 317 return false 318} 319 320func isGeneratedGo(_, ext string, content []byte) bool { 321 if ext != ".go" { 322 return false 323 } 324 325 lines := getLines(content, 40) 326 if len(lines) <= 1 { 327 return false 328 } 329 330 for _, line := range lines { 331 if bytes.Contains(line, []byte("Code generated by")) { 332 return true 333 } 334 } 335 336 return false 337} 338 339var protoExtensions = map[string]struct{}{ 340 ".py": {}, 341 ".java": {}, 342 ".h": {}, 343 ".cc": {}, 344 ".cpp": {}, 345 ".m": {}, 346 ".rb": {}, 347 ".php": {}, 348} 349 350func isGeneratedProtobuf(_, ext string, content []byte) bool { 351 if _, ok := protoExtensions[ext]; !ok { 352 return false 353 } 354 355 lines := getLines(content, 3) 356 if len(lines) <= 1 { 357 return false 358 } 359 360 for _, line := range lines { 361 if bytes.Contains(line, []byte("Generated by the protocol buffer compiler. 
DO NOT EDIT!")) { 362 return true 363 } 364 } 365 366 return false 367} 368 369func isGeneratedJavaScriptProtocolBuffer(_, ext string, content []byte) bool { 370 if ext != ".js" { 371 return false 372 } 373 374 lines := getLines(content, 6) 375 if len(lines) < 6 { 376 return false 377 } 378 379 return bytes.Contains(lines[5], []byte("GENERATED CODE -- DO NOT EDIT!")) 380} 381 382var apacheThriftExtensions = map[string]struct{}{ 383 ".rb": {}, 384 ".py": {}, 385 ".go": {}, 386 ".js": {}, 387 ".m": {}, 388 ".java": {}, 389 ".h": {}, 390 ".cc": {}, 391 ".cpp": {}, 392 ".php": {}, 393} 394 395func isGeneratedApacheThrift(_, ext string, content []byte) bool { 396 if _, ok := apacheThriftExtensions[ext]; !ok { 397 return false 398 } 399 400 for _, line := range getLines(content, 6) { 401 if bytes.Contains(line, []byte("Autogenerated by Thrift Compiler")) { 402 return true 403 } 404 } 405 406 return false 407} 408 409func isGeneratedJNIHeader(_, ext string, content []byte) bool { 410 if ext != ".h" { 411 return false 412 } 413 414 lines := getLines(content, 2) 415 if len(lines) < 2 { 416 return false 417 } 418 419 return bytes.Contains(lines[0], []byte("/* DO NOT EDIT THIS FILE - it is machine generated */")) && 420 bytes.Contains(lines[1], []byte("#include <jni.h>")) 421} 422 423func isVCRCassette(_, ext string, content []byte) bool { 424 if ext != ".yml" { 425 return false 426 } 427 428 lines := getLines(content, -2) 429 if len(lines) < 2 { 430 return false 431 } 432 433 return bytes.Contains(lines[1], []byte("recorded_with: VCR")) 434} 435 436func isCompiledCythonFile(_, ext string, content []byte) bool { 437 if ext != ".c" && ext != ".cpp" { 438 return false 439 } 440 441 lines := getLines(content, 1) 442 if len(lines) < 1 { 443 return false 444 } 445 446 return bytes.Contains(lines[0], []byte("Generated by Cython")) 447} 448 449func isGeneratedModule(_, ext string, content []byte) bool { 450 if ext != ".mod" { 451 return false 452 } 453 454 lines := getLines(content, 
1) 455 if len(lines) < 1 { 456 return false 457 } 458 459 return bytes.Contains(lines[0], []byte("PCBNEW-LibModule-V")) || 460 bytes.Contains(lines[0], []byte("GFORTRAN module version '")) 461} 462 463func isGeneratedUnity3DMeta(_, ext string, content []byte) bool { 464 if ext != ".meta" { 465 return false 466 } 467 468 lines := getLines(content, 1) 469 if len(lines) < 1 { 470 return false 471 } 472 473 return bytes.Contains(lines[0], []byte("fileFormatVersion: ")) 474} 475 476func isGeneratedRacc(_, ext string, content []byte) bool { 477 if ext != ".rb" { 478 return false 479 } 480 481 lines := getLines(content, 3) 482 if len(lines) < 3 { 483 return false 484 } 485 486 return bytes.HasPrefix(lines[2], []byte("# This file is automatically generated by Racc")) 487} 488 489func isGeneratedJFlex(_, ext string, content []byte) bool { 490 if ext != ".java" { 491 return false 492 } 493 494 lines := getLines(content, 1) 495 if len(lines) < 1 { 496 return false 497 } 498 499 return bytes.HasPrefix(lines[0], []byte("/* The following code was generated by JFlex ")) 500} 501 502func isGeneratedGrammarKit(_, ext string, content []byte) bool { 503 if ext != ".java" { 504 return false 505 } 506 507 lines := getLines(content, 1) 508 if len(lines) < 1 { 509 return false 510 } 511 512 return bytes.Contains(lines[0], []byte("// This is a generated file. 
Not intended for manual editing.")) 513} 514 515func isGeneratedRoxygen2(_, ext string, content []byte) bool { 516 if ext != ".rd" { 517 return false 518 } 519 520 lines := getLines(content, 1) 521 if len(lines) < 1 { 522 return false 523 } 524 525 return bytes.Contains(lines[0], []byte("% Generated by roxygen2: do not edit by hand")) 526} 527 528func isGeneratedJison(_, ext string, content []byte) bool { 529 if ext != ".js" { 530 return false 531 } 532 533 lines := getLines(content, 1) 534 if len(lines) < 1 { 535 return false 536 } 537 538 return bytes.Contains(lines[0], []byte("/* parser generated by jison ")) || 539 bytes.Contains(lines[0], []byte("/* generated by jison-lex ")) 540} 541 542func isGeneratedGRPCCpp(_, ext string, content []byte) bool { 543 switch ext { 544 case ".cpp", ".hpp", ".h", ".cc": 545 lines := getLines(content, 1) 546 if len(lines) < 1 { 547 return false 548 } 549 550 return bytes.Contains(lines[0], []byte("// Generated by the gRPC")) 551 default: 552 return false 553 } 554} 555 556var dartRegex = regex.MustCompile(`generated code\W{2,3}do not modify`) 557 558func isGeneratedDart(_, ext string, content []byte) bool { 559 if ext != ".dart" { 560 return false 561 } 562 563 lines := getLines(content, 1) 564 if len(lines) < 1 { 565 return false 566 } 567 568 return dartRegex.Match(bytes.ToLower(lines[0])) 569} 570 571func isGeneratedPerlPPPortHeader(name, _ string, content []byte) bool { 572 if !strings.HasSuffix(name, "ppport.h") { 573 return false 574 } 575 576 lines := getLines(content, 10) 577 if len(lines) < 10 { 578 return false 579 } 580 581 return bytes.Contains(lines[8], []byte("Automatically created by Devel::PPPort")) 582} 583 584var ( 585 gameMakerStudioFirstLineRegex = regex.MustCompile(`^\d\.\d\.\d.+\|\{`) 586 gameMakerStudioThirdLineRegex = regex.MustCompile(`\"modelName\"\:\s*\"GM`) 587) 588 589func isGeneratedGameMakerStudio(_, ext string, content []byte) bool { 590 if ext != ".yy" && ext != ".yyp" { 591 return false 592 } 
593 594 lines := getLines(content, 3) 595 if len(lines) < 3 { 596 return false 597 } 598 599 return gameMakerStudioThirdLineRegex.Match(lines[2]) || 600 gameMakerStudioFirstLineRegex.Match(lines[0]) 601} 602 603var gimpRegexes = []regex.EnryRegexp{ 604 regex.MustCompile(`\/\* GIMP [a-zA-Z0-9\- ]+ C\-Source image dump \(.+?\.c\) \*\/`), 605 regex.MustCompile(`\/\* GIMP header image file format \([a-zA-Z0-9\- ]+\)\: .+?\.h \*\/`), 606} 607 608func isGeneratedGimp(_, ext string, content []byte) bool { 609 if ext != ".c" && ext != ".h" { 610 return false 611 } 612 613 lines := getLines(content, 1) 614 if len(lines) < 1 { 615 return false 616 } 617 618 for _, r := range gimpRegexes { 619 if r.Match(lines[0]) { 620 return true 621 } 622 } 623 624 return false 625} 626 627func isGeneratedVisualStudio6(_, ext string, content []byte) bool { 628 if ext != ".dsp" { 629 return false 630 } 631 632 for _, l := range getLines(content, 3) { 633 if bytes.Contains(l, []byte("# Microsoft Developer Studio Generated Build File")) { 634 return true 635 } 636 } 637 638 return false 639} 640 641var haxeExtensions = map[string]struct{}{ 642 ".js": {}, 643 ".py": {}, 644 ".lua": {}, 645 ".cpp": {}, 646 ".h": {}, 647 ".java": {}, 648 ".cs": {}, 649 ".php": {}, 650} 651 652func isGeneratedHaxe(_, ext string, content []byte) bool { 653 if _, ok := haxeExtensions[ext]; !ok { 654 return false 655 } 656 657 for _, l := range getLines(content, 3) { 658 if bytes.Contains(l, []byte("Generated by Haxe")) { 659 return true 660 } 661 } 662 663 return false 664} 665 666var ( 667 doxygenRegex = regex.MustCompile(`<!--\s+Generated by Doxygen\s+[.0-9]+\s*-->`) 668 htmlMetaRegex = regex.MustCompile(`<meta(\s+[^>]+)>`) 669 htmlMetaContentRegex = regex.MustCompile(`\s+(name|content|value)\s*=\s*("[^"]+"|'[^']+'|[^\s"']+)`) 670 orgModeMetaRegex = regex.MustCompile(`org\s+mode`) 671) 672 673func isGeneratedHTML(_, ext string, content []byte) bool { 674 if ext != ".html" && ext != ".htm" && ext != ".xhtml" { 675 
return false 676 } 677 678 lines := getLines(content, 30) 679 680 // Pkgdown 681 if len(lines) >= 2 { 682 for _, l := range lines[:2] { 683 if bytes.Contains(l, []byte("<!-- Generated by pkgdown: do not edit by hand -->")) { 684 return true 685 } 686 } 687 } 688 689 // Mandoc 690 if len(lines) > 2 && 691 bytes.HasPrefix(lines[2], []byte("<!-- This is an automatically generated file.")) { 692 return true 693 } 694 695 // Doxygen 696 for _, l := range lines { 697 if doxygenRegex.Match(l) { 698 return true 699 } 700 } 701 702 // HTML tag: <meta name="generator" content="" /> 703 part := bytes.ToLower(bytes.Join(lines, []byte{' '})) 704 part = bytes.ReplaceAll(part, []byte{'\n'}, []byte{}) 705 part = bytes.ReplaceAll(part, []byte{'\r'}, []byte{}) 706 matches := htmlMetaRegex.FindAll(part, -1) 707 if len(matches) == 0 { 708 return false 709 } 710 711 for _, m := range matches { 712 var name, value, content string 713 ms := htmlMetaContentRegex.FindAllStringSubmatch(string(m), -1) 714 for _, m := range ms { 715 switch m[1] { 716 case "name": 717 name = m[2] 718 case "value": 719 value = m[2] 720 case "content": 721 content = m[2] 722 } 723 } 724 725 var val = value 726 if val == "" { 727 val = content 728 } 729 730 name = strings.Trim(name, `"'`) 731 val = strings.Trim(val, `"'`) 732 733 if name != "generator" || val == "" { 734 continue 735 } 736 737 if strings.Contains(val, "jlatex2html") || 738 strings.Contains(val, "latex2html") || 739 strings.Contains(val, "groff") || 740 strings.Contains(val, "makeinfo") || 741 strings.Contains(val, "texi2html") || 742 strings.Contains(val, "ronn") || 743 orgModeMetaRegex.MatchString(val) { 744 return true 745 } 746 } 747 748 return false 749} 750 751func isGeneratedJooq(_, ext string, content []byte) bool { 752 if ext != ".java" { 753 return false 754 } 755 756 for _, l := range getLines(content, 2) { 757 if bytes.Contains(l, []byte("This file is generated by jOOQ.")) { 758 return true 759 } 760 } 761 762 return false 763} 764 
// getFirstLine returns the first line of content without its newline, or nil
// when content has no lines.
func getFirstLine(content []byte) []byte {
	if first := getLines(content, 1); len(first) != 0 {
		return first[0]
	}
	return nil
}

// getLines returns up to the first n lines of content, split on '\n' with the
// newline removed. A negative n returns up to the last -n lines in reverse
// order (last line first).
//
// In the reverse case a candidate line is kept only if it starts before the
// second-to-last byte of content. This drops the empty segment that follows a
// trailing newline, and also a final line that is a single byte long.
func getLines(content []byte, n int) [][]byte {
	var lines [][]byte
	size := len(content)

	if n >= 0 {
		start := 0
		for start < size && len(lines) < n {
			// Without a further newline the line runs to the end of content.
			end := size
			if off := bytes.IndexByte(content[start:], '\n'); off >= 0 {
				end = start + off
			}

			lines = append(lines, content[start:end])
			start = end + 1
		}
		return lines
	}

	limit := -n
	end := size
	for end > 0 && len(lines) < limit {
		nl := bytes.LastIndexByte(content[:end], '\n')
		if begin := nl + 1; begin < size-1 {
			lines = append(lines, content[begin:end])
		}
		// Continue the scan just before the newline that was found; nl is -1
		// when none is left, which ends the loop.
		end = nl
	}

	return lines
}

// forEachLine calls cb once per line of content, in order, passing the line
// without its terminating '\n'. Nothing is reported for the empty segment
// after a trailing newline.
func forEachLine(content []byte, cb func([]byte)) {
	for rest := content; len(rest) > 0; {
		idx := bytes.IndexByte(rest, '\n')
		if idx < 0 {
			cb(rest)
			return
		}

		cb(rest[:idx])
		rest = rest[idx+1:]
	}
}

// countAppearancesInLine returns the total number of non-overlapping
// occurrences in line of every string in targets.
func countAppearancesInLine(line []byte, targets ...string) int {
	total := 0
	for i := range targets {
		total += bytes.Count(line, []byte(targets[i]))
	}
	return total
}