1// sift 2// Copyright (C) 2014-2016 Sven Taute 3// 4// This program is free software: you can redistribute it and/or modify 5// it under the terms of the GNU General Public License as published by 6// the Free Software Foundation, version 3 of the License. 7// 8// This program is distributed in the hope that it will be useful, 9// but WITHOUT ANY WARRANTY; without even the implied warranty of 10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11// GNU General Public License for more details. 12// 13// You should have received a copy of the GNU General Public License 14// along with this program. If not, see <http://www.gnu.org/licenses/>. 15 16package main 17 18import ( 19 "bufio" 20 "bytes" 21 "io" 22 "os" 23 "regexp" 24 "sort" 25) 26 27// processReader is the main routine working on an io.Reader 28func processReader(reader io.Reader, matchRegexes []*regexp.Regexp, data []byte, testBuffer []byte, target string) error { 29 var ( 30 bufferOffset int 31 err error 32 isEOF bool 33 lastInputBlockSize int 34 lastMatch *Match 35 lastRoundMultilineWindow bool 36 lastSeekAmount int 37 lastValidMatchRange int 38 linecount int64 = 1 39 matchChan chan Matches 40 matchCount int64 41 offset int64 42 resultIsBinary bool 43 resultStreaming bool 44 validMatchRange int 45 ) 46 matches := make([]Match, 0, 16) 47 conditionMatches := make([]Match, 0, 16) 48 49 for { 50 if isEOF { 51 break 52 } 53 var length int 54 lastConditionMatch := len(conditionMatches) - 1 55 56 if options.Multiline { 57 if lastRoundMultilineWindow { 58 // if the last input block was greater than the sliding window size, that last part has to be processed again 59 copy(data[bufferOffset:bufferOffset+InputMultilineWindow], data[lastInputBlockSize-InputMultilineWindow:lastInputBlockSize]) 60 length, err = reader.Read(data[bufferOffset+InputMultilineWindow:]) 61 length += bufferOffset 62 length += InputMultilineWindow 63 } else { 64 length, err = reader.Read(data[bufferOffset:]) 65 length += bufferOffset 66 } 67 if err != nil { 68 if err == io.EOF { 69 isEOF = true 70 } else { 71 return err 72 } 73 } 74 lastInputBlockSize = length 75 76 // if the input block was greater than the sliding window size, only process a part of it 77 if !isEOF && length > InputMultilineWindow { 78 validMatchRange = length - InputMultilineWindow 79 lastRoundMultilineWindow = true 80 } else { 81 validMatchRange = length 82 lastRoundMultilineWindow = false 83 } 84 } else { 85 // single line mode 86 length, err = reader.Read(data[bufferOffset:]) 87 if err != nil { 88 if err == io.EOF { 89 isEOF = true 90 } else { 91 return err 92 } 93 } 94 length += bufferOffset 95 validMatchRange = length 96 lastInputBlockSize = length 97 } 98 99 lastValidMatchRange = validMatchRange 100 101 // check if file is binary (0x00 found in first 256 bytes) 102 if offset == 0 { 103 s := 256 104 if length < s { 105 s = length 106 } 107 for i := 0; i < s; i++ { 108 if data[i] == 0 { 109 resultIsBinary = true 110 if options.BinarySkip { 111 return nil 112 } 113 break 114 } 115 } 116 } 117 118 // adjust validMatchRange to end on a newline 119 lastSeekAmount = 0 120 if !isEOF { 121 newLineFound := false 122 var pos int 123 for pos = validMatchRange - 1; pos >= 0; pos-- { 124 if data[pos] == '\n' { 125 newLineFound = true 126 break 127 } 128 } 129 if newLineFound { 130 lastSeekAmount = validMatchRange - 1 - pos 131 validMatchRange = validMatchRange - lastSeekAmount 132 bufferOffset = 0 133 } else { 134 if lastInputBlockSize == InputBlockSize { 135 return errLineTooLong 136 } 137 bufferOffset = validMatchRange 138 continue 139 } 140 } 141 142 var testDataPtr []byte 143 if options.IgnoreCase { 144 bytesToLower(data, testBuffer, length) 145 testDataPtr = testBuffer[0:length] 146 } else { 147 testDataPtr = data[0:length] 148 } 149 150 var newMatches Matches 151 for _, re := range matchRegexes { 152 tmpMatches := getMatches(re, data, testDataPtr, offset, length, validMatchRange, 0, target) 153 if len(tmpMatches) > 0 { 154 newMatches = append(newMatches, tmpMatches...) 155 } 156 } 157 158 // sort matches and filter duplicates 159 if len(newMatches) > 0 { 160 sort.Sort(Matches(newMatches)) 161 var validMatch bool 162 prevMatch := lastMatch 163 for i := 0; i < len(newMatches); { 164 validMatch = false 165 if prevMatch == nil { 166 validMatch = true 167 } else { 168 m := newMatches[i] 169 if (!options.Multiline && m.lineEnd > prevMatch.lineEnd) || 170 (options.Multiline && m.start >= prevMatch.end) { 171 validMatch = true 172 } 173 } 174 if validMatch { 175 prevMatch = &newMatches[i] 176 i++ 177 } else { 178 copy(newMatches[i:], newMatches[i+1:]) 179 newMatches = newMatches[0 : len(newMatches)-1] 180 } 181 } 182 } 183 184 for conditionID, condition := range global.conditions { 185 tmpMatches := getMatches(condition.regex, data, testDataPtr, offset, length, validMatchRange, conditionID, target) 186 if len(tmpMatches) > 0 { 187 conditionMatches = append(conditionMatches, tmpMatches...) 188 } 189 } 190 if len(conditionMatches) > 0 { 191 sort.Sort(Matches(conditionMatches)) 192 } 193 194 if options.ShowLineNumbers || options.ContextBefore > 0 || options.ContextAfter > 0 || len(global.conditions) > 0 { 195 linecount = countLines(data, lastConditionMatch, newMatches, conditionMatches, offset, validMatchRange, linecount) 196 } 197 198 if len(newMatches) > 0 { 199 // if a list option is used exit here if possible 200 if (options.FilesWithMatches || options.FilesWithoutMatch) && !options.Count && len(global.conditions) == 0 { 201 global.resultsChan <- &Result{target: target, matches: []Match{Match{}}} 202 return nil 203 } 204 205 lastMatch = &newMatches[len(newMatches)-1] 206 if resultStreaming { 207 matchChan <- newMatches 208 } else { 209 matches = append(matches, newMatches...) 210 if len(matches) > global.streamingThreshold && global.streamingAllowed { 211 resultStreaming = true 212 matchChan = make(chan Matches, 16) 213 global.resultsChan <- &Result{target: target, matches: matches, streaming: true, matchChan: matchChan, isBinary: resultIsBinary} 214 defer func() { 215 close(matchChan) 216 }() 217 } 218 } 219 220 matchCount += int64(len(newMatches)) 221 if options.Limit != 0 && matchCount >= options.Limit { 222 break 223 } 224 } 225 226 // copy the bytes not processed after the last newline to the beginning of the buffer 227 if lastSeekAmount > 0 { 228 copy(data[bufferOffset:bufferOffset+lastSeekAmount], data[lastValidMatchRange-lastSeekAmount:lastValidMatchRange]) 229 bufferOffset += lastSeekAmount 230 } 231 232 offset += int64(validMatchRange) 233 } 234 235 if !resultStreaming { 236 global.resultsChan <- &Result{target: target, matches: matches, conditionMatches: conditionMatches, streaming: false, isBinary: resultIsBinary} 237 } 238 return nil 239} 240 241// getMatches gets all matches in the provided data, it is used for normal and condition matches. 242// 243// data contains the original data. 244// testBuffer contains the data to test the regex against (potentially modified, e.g. to support the ignore case option). 245// length contains the length of the provided data. 246// matches are only valid if they start within the validMatchRange. 247func getMatches(regex *regexp.Regexp, data []byte, testBuffer []byte, offset int64, length int, validMatchRange int, conditionID int, target string) Matches { 248 var matches Matches 249 if allIndex := regex.FindAllIndex(testBuffer, -1); allIndex != nil { 250 // for _, index := range allindex { 251 for mi := 0; mi < len(allIndex); mi++ { 252 index := allIndex[mi] 253 start := index[0] 254 end := index[1] 255 // \s always matches newline, leading to incorrect matches in non-multiline mode 256 // analyze match and reject false matches 257 if !options.Multiline { 258 // remove newlines at the beginning of the match 259 for ; start < length && end > start && data[start] == 0x0a; start++ { 260 } 261 // remove newlines at the end of the match 262 for ; end > 0 && end > start && data[end-1] == 0x0a; end-- { 263 } 264 // check if the corrected match is still valid 265 if !regex.Match(testBuffer[start:end]) { 266 continue 267 } 268 // check if the match contains newlines 269 if bytes.Contains(data[start:end], []byte{0x0a}) { 270 // Rebuild the complete lines to check whether these contain valid matches. 271 // In very rare cases, multiple lines may contain a valid match. As multiple 272 // matches cannot be processed correctly here, requeue them to be processed again. 273 lineStart := start 274 lineEnd := end 275 for lineStart > 0 && data[lineStart-1] != 0x0a { 276 lineStart-- 277 } 278 for lineEnd < length && data[lineEnd] != 0x0a { 279 lineEnd++ 280 } 281 282 lastStart := lineStart 283 for pos := lastStart + 1; pos < lineEnd; pos++ { 284 if data[pos] == 0x0a || pos == lineEnd-1 { 285 if pos == lineEnd-1 && data[pos] != 0x0a { 286 pos++ 287 } 288 if idx := regex.FindIndex(testBuffer[lastStart:pos]); idx != nil { 289 start = lastStart 290 end = pos 291 start = lastStart + idx[0] 292 end = lastStart + idx[1] 293 allIndex = append(allIndex, []int{start, end}) 294 } 295 lastStart = pos + 1 296 } 297 } 298 continue 299 } 300 } 301 302 lineStart := start 303 lineEnd := end 304 if options.Multiline && start >= validMatchRange { 305 continue 306 } 307 for lineStart > 0 && data[lineStart-1] != 0x0a { 308 lineStart-- 309 } 310 for lineEnd < length && data[lineEnd] != 0x0a { 311 lineEnd++ 312 } 313 314 var contextBefore *string 315 var contextAfter *string 316 317 if options.ContextBefore > 0 { 318 var contextBeforeStart int 319 if lineStart > 0 { 320 contextBeforeStart = lineStart - 1 321 precedingLinesFound := 0 322 for contextBeforeStart > 0 { 323 if data[contextBeforeStart-1] == 0x0a { 324 precedingLinesFound++ 325 if precedingLinesFound == options.ContextBefore { 326 break 327 } 328 } 329 contextBeforeStart-- 330 } 331 if precedingLinesFound < options.ContextBefore && contextBeforeStart == 0 && offset > 0 { 332 contextBefore = getBeforeContextFromFile(target, offset, start) 333 } else { 334 tmp := string(data[contextBeforeStart : lineStart-1]) 335 contextBefore = &tmp 336 } 337 } else { 338 if offset > 0 { 339 contextBefore = getBeforeContextFromFile(target, offset, start) 340 } else { 341 contextBefore = nil 342 } 343 } 344 } 345 346 if options.ContextAfter > 0 { 347 var contextAfterEnd int 348 if lineEnd < length-1 { 349 contextAfterEnd = lineEnd 350 followingLinesFound := 0 351 for contextAfterEnd < length-1 { 352 if data[contextAfterEnd+1] == 0x0a { 353 followingLinesFound++ 354 if followingLinesFound == options.ContextAfter { 355 contextAfterEnd++ 356 break 357 } 358 } 359 contextAfterEnd++ 360 } 361 if followingLinesFound < options.ContextAfter && contextAfterEnd == length-1 { 362 contextAfter = getAfterContextFromFile(target, offset, end) 363 } else { 364 tmp := string(data[lineEnd+1 : contextAfterEnd]) 365 contextAfter = &tmp 366 } 367 } else { 368 contextAfter = getAfterContextFromFile(target, offset, end) 369 } 370 } 371 372 m := Match{ 373 conditionID: conditionID, 374 start: offset + int64(start), 375 end: offset + int64(end), 376 lineStart: offset + int64(lineStart), 377 lineEnd: offset + int64(lineEnd), 378 match: string(data[start:end]), 379 line: string(data[lineStart:lineEnd]), 380 contextBefore: contextBefore, 381 contextAfter: contextAfter, 382 } 383 384 // handle special case where '^' matches after the last newline 385 if int(lineStart) != validMatchRange { 386 matches = append(matches, m) 387 } 388 } 389 } 390 return matches 391} 392 393// countLines counts the linebreaks within the given buffer and calculates the correct line numbers for new matches 394func countLines(data []byte, lastConditionMatch int, matches Matches, conditionMatches Matches, offset int64, validMatchRange int, lineCount int64) int64 { 395 currentMatch := 0 396 currentConditionMatch := lastConditionMatch + 1 397 if currentMatch < len(matches) || currentConditionMatch < len(conditionMatches) { 398 for i := 0; i < validMatchRange; i++ { 399 if data[i] == 0xa { 400 for currentMatch < len(matches) && offset+int64(i) >= matches[currentMatch].lineStart { 401 matches[currentMatch].lineno = lineCount 402 currentMatch++ 403 } 404 for currentConditionMatch < len(conditionMatches) && offset+int64(i) >= conditionMatches[currentConditionMatch].lineStart { 405 conditionMatches[currentConditionMatch].lineno = lineCount 406 currentConditionMatch++ 407 } 408 lineCount++ 409 } 410 } 411 // check for matches on last line without newline 412 for currentMatch < len(matches) && offset+int64(validMatchRange) >= matches[currentMatch].lineStart { 413 matches[currentMatch].lineno = lineCount 414 currentMatch++ 415 } 416 for currentConditionMatch < len(conditionMatches) && offset+int64(validMatchRange) >= conditionMatches[currentConditionMatch].lineStart { 417 conditionMatches[currentConditionMatch].lineno = lineCount 418 currentConditionMatch++ 419 } 420 } else { 421 lineCount += int64(countNewlines(data, validMatchRange)) 422 } 423 return lineCount 424} 425 426// applyConditions removes matches from a result that do not fulfill all conditions 427func (result *Result) applyConditions() { 428 if len(result.matches) == 0 || len(global.conditions) == 0 { 429 return 430 } 431 432 // check conditions that are independent of found matches 433 conditionStatus := make([]bool, len(global.conditions)) 434 var conditionFulfilled bool 435 for _, conditionMatch := range result.conditionMatches { 436 conditionFulfilled = false 437 switch global.conditions[conditionMatch.conditionID].conditionType { 438 case ConditionFileMatches: 439 conditionFulfilled = true 440 case ConditionLineMatches: 441 if conditionMatch.lineno == global.conditions[conditionMatch.conditionID].lineRangeStart { 442 conditionFulfilled = true 443 } 444 case ConditionRangeMatches: 445 if conditionMatch.lineno >= global.conditions[conditionMatch.conditionID].lineRangeStart && 446 conditionMatch.lineno <= global.conditions[conditionMatch.conditionID].lineRangeEnd { 447 conditionFulfilled = true 448 } 449 default: 450 // ingore other condition types 451 conditionFulfilled = !global.conditions[conditionMatch.conditionID].negated 452 } 453 if conditionFulfilled { 454 if global.conditions[conditionMatch.conditionID].negated { 455 result.matches = Matches{} 456 return 457 } 458 conditionStatus[conditionMatch.conditionID] = true 459 } 460 } 461 for i := range conditionStatus { 462 if conditionStatus[i] != true && !global.conditions[i].negated { 463 result.matches = Matches{} 464 return 465 } 466 } 467 468MatchLoop: 469 // check for each match whether preceded/followed/surrounded conditions are fulfilled 470 for matchIndex := 0; matchIndex < len(result.matches); { 471 match := result.matches[matchIndex] 472 lineno := match.lineno 473 conditionStatus := make([]bool, len(global.conditions)) 474 for _, conditionMatch := range result.conditionMatches { 475 conditionFulfilled := false 476 maxAllowedDistance := global.conditions[conditionMatch.conditionID].within 477 var actualDistance int64 = -1 478 switch global.conditions[conditionMatch.conditionID].conditionType { 479 case ConditionPreceded: 480 actualDistance = lineno - conditionMatch.lineno 481 if actualDistance == 0 { 482 conditionFulfilled = conditionMatch.start < match.start 483 } else { 484 conditionFulfilled = (actualDistance >= 0) && (maxAllowedDistance == -1 || actualDistance <= maxAllowedDistance) 485 } 486 case ConditionFollowed: 487 actualDistance = conditionMatch.lineno - lineno 488 if actualDistance == 0 { 489 conditionFulfilled = conditionMatch.start > match.start 490 } else { 491 conditionFulfilled = (actualDistance >= 0) && (maxAllowedDistance == -1 || actualDistance <= maxAllowedDistance) 492 } 493 case ConditionSurrounded: 494 if lineno > conditionMatch.lineno { 495 actualDistance = lineno - conditionMatch.lineno 496 } else { 497 actualDistance = conditionMatch.lineno - lineno 498 } 499 if actualDistance == 0 { 500 conditionFulfilled = true 501 } else { 502 conditionFulfilled = (actualDistance >= 0) && (maxAllowedDistance == -1 || actualDistance <= maxAllowedDistance) 503 } 504 default: 505 // ingore other condition types 506 conditionFulfilled = !global.conditions[conditionMatch.conditionID].negated 507 } 508 if conditionFulfilled { 509 if global.conditions[conditionMatch.conditionID].negated { 510 goto ConditionFailed 511 } else { 512 conditionStatus[conditionMatch.conditionID] = true 513 } 514 } 515 } 516 for i := range conditionStatus { 517 if conditionStatus[i] != true && !global.conditions[i].negated { 518 goto ConditionFailed 519 } 520 } 521 matchIndex++ 522 continue MatchLoop 523 524 ConditionFailed: 525 copy(result.matches[matchIndex:], result.matches[matchIndex+1:]) 526 result.matches = result.matches[0 : len(result.matches)-1] 527 } 528} 529 530// getBeforeContextFromFile gets the context lines directly from the file. 531// It is used when the context lines exceed the currently buffered data from the file. 532func getBeforeContextFromFile(target string, offset int64, start int) *string { 533 var contextBeforeStart int 534 infile, _ := os.Open(target) 535 seekPosition := offset + int64(start) - int64(InputBlockSize) 536 if seekPosition < 0 { 537 seekPosition = 0 538 } 539 count := InputBlockSize 540 if offset == 0 && start < InputBlockSize { 541 count = start 542 } 543 infile.Seek(seekPosition, 0) 544 reader := bufio.NewReader(infile) 545 buffer := make([]byte, count) 546 reader.Read(buffer) 547 548 lineStart := len(buffer) 549 for lineStart > 0 && buffer[lineStart-1] != 0x0a { 550 lineStart-- 551 } 552 if lineStart > 0 { 553 contextBeforeStart = lineStart - 1 554 precedingLinesFound := 0 555 for contextBeforeStart > 0 { 556 if buffer[contextBeforeStart-1] == 0x0a { 557 precedingLinesFound++ 558 if precedingLinesFound == options.ContextBefore { 559 break 560 } 561 } 562 contextBeforeStart-- 563 } 564 tmp := string(buffer[contextBeforeStart : lineStart-1]) 565 return &tmp 566 } 567 return nil 568} 569 570// getAfterContextFromFile gets the context lines directly from the file. 571// It is used when the context lines exceed the currently buffered data from the file. 572func getAfterContextFromFile(target string, offset int64, end int) *string { 573 var contextAfterEnd int 574 infile, _ := os.Open(target) 575 seekPosition := offset + int64(end) 576 infile.Seek(seekPosition, 0) 577 reader := bufio.NewReader(infile) 578 buffer := make([]byte, InputBlockSize) 579 length, _ := reader.Read(buffer) 580 581 lineEnd := 0 582 for lineEnd < length && buffer[lineEnd] != 0x0a { 583 lineEnd++ 584 } 585 if lineEnd < length-1 { 586 contextAfterEnd = lineEnd 587 followingLinesFound := 0 588 for contextAfterEnd < length-1 { 589 if buffer[contextAfterEnd+1] == 0x0a { 590 followingLinesFound++ 591 if followingLinesFound == options.ContextAfter { 592 contextAfterEnd++ 593 break 594 } 595 } 596 contextAfterEnd++ 597 } 598 if followingLinesFound < options.ContextAfter && contextAfterEnd == length-1 && buffer[length-1] != 0x0a { 599 contextAfterEnd++ 600 } 601 tmp := string(buffer[lineEnd+1 : contextAfterEnd]) 602 return &tmp 603 } 604 return nil 605} 606 607// processInvertMatchesReader is used to handle the '--invert' option. 608// This function works line based and provides very limited support for options. 609func processReaderInvertMatch(reader io.Reader, matchRegexes []*regexp.Regexp, target string) error { 610 matches := make([]Match, 0, 16) 611 var linecount int64 612 var matchFound bool 613 scanner := bufio.NewScanner(reader) 614 for scanner.Scan() { 615 line := scanner.Text() 616 linecount++ 617 matchFound = false 618 for _, re := range global.matchRegexes { 619 if re.MatchString(line) { 620 matchFound = true 621 } 622 } 623 if !matchFound { 624 if options.FilesWithMatches || options.FilesWithoutMatch { 625 global.resultsChan <- &Result{matches: []Match{Match{}}, target: target} 626 return nil 627 } 628 m := Match{ 629 lineno: linecount, 630 line: line} 631 matches = append(matches, m) 632 633 } 634 } 635 result := &Result{matches: matches, target: target} 636 global.resultsChan <- result 637 return nil 638} 639