1// sift
2// Copyright (C) 2014-2016 Sven Taute
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3 of the License.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program.  If not, see <http://www.gnu.org/licenses/>.
15
16package main
17
18import (
19	"bufio"
20	"bytes"
21	"io"
22	"os"
23	"regexp"
24	"sort"
25)
26
27// processReader is the main routine working on an io.Reader
28func processReader(reader io.Reader, matchRegexes []*regexp.Regexp, data []byte, testBuffer []byte, target string) error {
29	var (
30		bufferOffset             int
31		err                      error
32		isEOF                    bool
33		lastInputBlockSize       int
34		lastMatch                *Match
35		lastRoundMultilineWindow bool
36		lastSeekAmount           int
37		lastValidMatchRange      int
38		linecount                int64 = 1
39		matchChan                chan Matches
40		matchCount               int64
41		offset                   int64
42		resultIsBinary           bool
43		resultStreaming          bool
44		validMatchRange          int
45	)
46	matches := make([]Match, 0, 16)
47	conditionMatches := make([]Match, 0, 16)
48
49	for {
50		if isEOF {
51			break
52		}
53		var length int
54		lastConditionMatch := len(conditionMatches) - 1
55
56		if options.Multiline {
57			if lastRoundMultilineWindow {
58				// if the last input block was greater than the sliding window size, that last part has to be processed again
59				copy(data[bufferOffset:bufferOffset+InputMultilineWindow], data[lastInputBlockSize-InputMultilineWindow:lastInputBlockSize])
60				length, err = reader.Read(data[bufferOffset+InputMultilineWindow:])
61				length += bufferOffset
62				length += InputMultilineWindow
63			} else {
64				length, err = reader.Read(data[bufferOffset:])
65				length += bufferOffset
66			}
67			if err != nil {
68				if err == io.EOF {
69					isEOF = true
70				} else {
71					return err
72				}
73			}
74			lastInputBlockSize = length
75
76			// if the input block was greater than the sliding window size, only process a part of it
77			if !isEOF && length > InputMultilineWindow {
78				validMatchRange = length - InputMultilineWindow
79				lastRoundMultilineWindow = true
80			} else {
81				validMatchRange = length
82				lastRoundMultilineWindow = false
83			}
84		} else {
85			// single line mode
86			length, err = reader.Read(data[bufferOffset:])
87			if err != nil {
88				if err == io.EOF {
89					isEOF = true
90				} else {
91					return err
92				}
93			}
94			length += bufferOffset
95			validMatchRange = length
96			lastInputBlockSize = length
97		}
98
99		lastValidMatchRange = validMatchRange
100
101		// check if file is binary (0x00 found in first 256 bytes)
102		if offset == 0 {
103			s := 256
104			if length < s {
105				s = length
106			}
107			for i := 0; i < s; i++ {
108				if data[i] == 0 {
109					resultIsBinary = true
110					if options.BinarySkip {
111						return nil
112					}
113					break
114				}
115			}
116		}
117
118		// adjust validMatchRange to end on a newline
119		lastSeekAmount = 0
120		if !isEOF {
121			newLineFound := false
122			var pos int
123			for pos = validMatchRange - 1; pos >= 0; pos-- {
124				if data[pos] == '\n' {
125					newLineFound = true
126					break
127				}
128			}
129			if newLineFound {
130				lastSeekAmount = validMatchRange - 1 - pos
131				validMatchRange = validMatchRange - lastSeekAmount
132				bufferOffset = 0
133			} else {
134				if lastInputBlockSize == InputBlockSize {
135					return errLineTooLong
136				}
137				bufferOffset = validMatchRange
138				continue
139			}
140		}
141
142		var testDataPtr []byte
143		if options.IgnoreCase {
144			bytesToLower(data, testBuffer, length)
145			testDataPtr = testBuffer[0:length]
146		} else {
147			testDataPtr = data[0:length]
148		}
149
150		var newMatches Matches
151		for _, re := range matchRegexes {
152			tmpMatches := getMatches(re, data, testDataPtr, offset, length, validMatchRange, 0, target)
153			if len(tmpMatches) > 0 {
154				newMatches = append(newMatches, tmpMatches...)
155			}
156		}
157
158		// sort matches and filter duplicates
159		if len(newMatches) > 0 {
160			sort.Sort(Matches(newMatches))
161			var validMatch bool
162			prevMatch := lastMatch
163			for i := 0; i < len(newMatches); {
164				validMatch = false
165				if prevMatch == nil {
166					validMatch = true
167				} else {
168					m := newMatches[i]
169					if (!options.Multiline && m.lineEnd > prevMatch.lineEnd) ||
170						(options.Multiline && m.start >= prevMatch.end) {
171						validMatch = true
172					}
173				}
174				if validMatch {
175					prevMatch = &newMatches[i]
176					i++
177				} else {
178					copy(newMatches[i:], newMatches[i+1:])
179					newMatches = newMatches[0 : len(newMatches)-1]
180				}
181			}
182		}
183
184		for conditionID, condition := range global.conditions {
185			tmpMatches := getMatches(condition.regex, data, testDataPtr, offset, length, validMatchRange, conditionID, target)
186			if len(tmpMatches) > 0 {
187				conditionMatches = append(conditionMatches, tmpMatches...)
188			}
189		}
190		if len(conditionMatches) > 0 {
191			sort.Sort(Matches(conditionMatches))
192		}
193
194		if options.ShowLineNumbers || options.ContextBefore > 0 || options.ContextAfter > 0 || len(global.conditions) > 0 {
195			linecount = countLines(data, lastConditionMatch, newMatches, conditionMatches, offset, validMatchRange, linecount)
196		}
197
198		if len(newMatches) > 0 {
199			// if a list option is used exit here if possible
200			if (options.FilesWithMatches || options.FilesWithoutMatch) && !options.Count && len(global.conditions) == 0 {
201				global.resultsChan <- &Result{target: target, matches: []Match{Match{}}}
202				return nil
203			}
204
205			lastMatch = &newMatches[len(newMatches)-1]
206			if resultStreaming {
207				matchChan <- newMatches
208			} else {
209				matches = append(matches, newMatches...)
210				if len(matches) > global.streamingThreshold && global.streamingAllowed {
211					resultStreaming = true
212					matchChan = make(chan Matches, 16)
213					global.resultsChan <- &Result{target: target, matches: matches, streaming: true, matchChan: matchChan, isBinary: resultIsBinary}
214					defer func() {
215						close(matchChan)
216					}()
217				}
218			}
219
220			matchCount += int64(len(newMatches))
221			if options.Limit != 0 && matchCount >= options.Limit {
222				break
223			}
224		}
225
226		// copy the bytes not processed after the last newline to the beginning of the buffer
227		if lastSeekAmount > 0 {
228			copy(data[bufferOffset:bufferOffset+lastSeekAmount], data[lastValidMatchRange-lastSeekAmount:lastValidMatchRange])
229			bufferOffset += lastSeekAmount
230		}
231
232		offset += int64(validMatchRange)
233	}
234
235	if !resultStreaming {
236		global.resultsChan <- &Result{target: target, matches: matches, conditionMatches: conditionMatches, streaming: false, isBinary: resultIsBinary}
237	}
238	return nil
239}
240
241// getMatches gets all matches in the provided data, it is used for normal and condition matches.
242//
243// data contains the original data.
244// testBuffer contains the data to test the regex against (potentially modified, e.g. to support the ignore case option).
245// length contains the length of the provided data.
246// matches are only valid if they start within the validMatchRange.
247func getMatches(regex *regexp.Regexp, data []byte, testBuffer []byte, offset int64, length int, validMatchRange int, conditionID int, target string) Matches {
248	var matches Matches
249	if allIndex := regex.FindAllIndex(testBuffer, -1); allIndex != nil {
250		// for _, index := range allindex {
251		for mi := 0; mi < len(allIndex); mi++ {
252			index := allIndex[mi]
253			start := index[0]
254			end := index[1]
255			// \s always matches newline, leading to incorrect matches in non-multiline mode
256			// analyze match and reject false matches
257			if !options.Multiline {
258				// remove newlines at the beginning of the match
259				for ; start < length && end > start && data[start] == 0x0a; start++ {
260				}
261				// remove newlines at the end of the match
262				for ; end > 0 && end > start && data[end-1] == 0x0a; end-- {
263				}
264				// check if the corrected match is still valid
265				if !regex.Match(testBuffer[start:end]) {
266					continue
267				}
268				// check if the match contains newlines
269				if bytes.Contains(data[start:end], []byte{0x0a}) {
270					// Rebuild the complete lines to check whether these contain valid matches.
271					// In very rare cases, multiple lines may contain a valid match. As multiple
272					// matches cannot be processed correctly here, requeue them to be processed again.
273					lineStart := start
274					lineEnd := end
275					for lineStart > 0 && data[lineStart-1] != 0x0a {
276						lineStart--
277					}
278					for lineEnd < length && data[lineEnd] != 0x0a {
279						lineEnd++
280					}
281
282					lastStart := lineStart
283					for pos := lastStart + 1; pos < lineEnd; pos++ {
284						if data[pos] == 0x0a || pos == lineEnd-1 {
285							if pos == lineEnd-1 && data[pos] != 0x0a {
286								pos++
287							}
288							if idx := regex.FindIndex(testBuffer[lastStart:pos]); idx != nil {
289								start = lastStart
290								end = pos
291								start = lastStart + idx[0]
292								end = lastStart + idx[1]
293								allIndex = append(allIndex, []int{start, end})
294							}
295							lastStart = pos + 1
296						}
297					}
298					continue
299				}
300			}
301
302			lineStart := start
303			lineEnd := end
304			if options.Multiline && start >= validMatchRange {
305				continue
306			}
307			for lineStart > 0 && data[lineStart-1] != 0x0a {
308				lineStart--
309			}
310			for lineEnd < length && data[lineEnd] != 0x0a {
311				lineEnd++
312			}
313
314			var contextBefore *string
315			var contextAfter *string
316
317			if options.ContextBefore > 0 {
318				var contextBeforeStart int
319				if lineStart > 0 {
320					contextBeforeStart = lineStart - 1
321					precedingLinesFound := 0
322					for contextBeforeStart > 0 {
323						if data[contextBeforeStart-1] == 0x0a {
324							precedingLinesFound++
325							if precedingLinesFound == options.ContextBefore {
326								break
327							}
328						}
329						contextBeforeStart--
330					}
331					if precedingLinesFound < options.ContextBefore && contextBeforeStart == 0 && offset > 0 {
332						contextBefore = getBeforeContextFromFile(target, offset, start)
333					} else {
334						tmp := string(data[contextBeforeStart : lineStart-1])
335						contextBefore = &tmp
336					}
337				} else {
338					if offset > 0 {
339						contextBefore = getBeforeContextFromFile(target, offset, start)
340					} else {
341						contextBefore = nil
342					}
343				}
344			}
345
346			if options.ContextAfter > 0 {
347				var contextAfterEnd int
348				if lineEnd < length-1 {
349					contextAfterEnd = lineEnd
350					followingLinesFound := 0
351					for contextAfterEnd < length-1 {
352						if data[contextAfterEnd+1] == 0x0a {
353							followingLinesFound++
354							if followingLinesFound == options.ContextAfter {
355								contextAfterEnd++
356								break
357							}
358						}
359						contextAfterEnd++
360					}
361					if followingLinesFound < options.ContextAfter && contextAfterEnd == length-1 {
362						contextAfter = getAfterContextFromFile(target, offset, end)
363					} else {
364						tmp := string(data[lineEnd+1 : contextAfterEnd])
365						contextAfter = &tmp
366					}
367				} else {
368					contextAfter = getAfterContextFromFile(target, offset, end)
369				}
370			}
371
372			m := Match{
373				conditionID:   conditionID,
374				start:         offset + int64(start),
375				end:           offset + int64(end),
376				lineStart:     offset + int64(lineStart),
377				lineEnd:       offset + int64(lineEnd),
378				match:         string(data[start:end]),
379				line:          string(data[lineStart:lineEnd]),
380				contextBefore: contextBefore,
381				contextAfter:  contextAfter,
382			}
383
384			// handle special case where '^' matches after the last newline
385			if int(lineStart) != validMatchRange {
386				matches = append(matches, m)
387			}
388		}
389	}
390	return matches
391}
392
393// countLines counts the linebreaks within the given buffer and calculates the correct line numbers for new matches
394func countLines(data []byte, lastConditionMatch int, matches Matches, conditionMatches Matches, offset int64, validMatchRange int, lineCount int64) int64 {
395	currentMatch := 0
396	currentConditionMatch := lastConditionMatch + 1
397	if currentMatch < len(matches) || currentConditionMatch < len(conditionMatches) {
398		for i := 0; i < validMatchRange; i++ {
399			if data[i] == 0xa {
400				for currentMatch < len(matches) && offset+int64(i) >= matches[currentMatch].lineStart {
401					matches[currentMatch].lineno = lineCount
402					currentMatch++
403				}
404				for currentConditionMatch < len(conditionMatches) && offset+int64(i) >= conditionMatches[currentConditionMatch].lineStart {
405					conditionMatches[currentConditionMatch].lineno = lineCount
406					currentConditionMatch++
407				}
408				lineCount++
409			}
410		}
411		// check for matches on last line without newline
412		for currentMatch < len(matches) && offset+int64(validMatchRange) >= matches[currentMatch].lineStart {
413			matches[currentMatch].lineno = lineCount
414			currentMatch++
415		}
416		for currentConditionMatch < len(conditionMatches) && offset+int64(validMatchRange) >= conditionMatches[currentConditionMatch].lineStart {
417			conditionMatches[currentConditionMatch].lineno = lineCount
418			currentConditionMatch++
419		}
420	} else {
421		lineCount += int64(countNewlines(data, validMatchRange))
422	}
423	return lineCount
424}
425
426// applyConditions removes matches from a result that do not fulfill all conditions
427func (result *Result) applyConditions() {
428	if len(result.matches) == 0 || len(global.conditions) == 0 {
429		return
430	}
431
432	// check conditions that are independent of found matches
433	conditionStatus := make([]bool, len(global.conditions))
434	var conditionFulfilled bool
435	for _, conditionMatch := range result.conditionMatches {
436		conditionFulfilled = false
437		switch global.conditions[conditionMatch.conditionID].conditionType {
438		case ConditionFileMatches:
439			conditionFulfilled = true
440		case ConditionLineMatches:
441			if conditionMatch.lineno == global.conditions[conditionMatch.conditionID].lineRangeStart {
442				conditionFulfilled = true
443			}
444		case ConditionRangeMatches:
445			if conditionMatch.lineno >= global.conditions[conditionMatch.conditionID].lineRangeStart &&
446				conditionMatch.lineno <= global.conditions[conditionMatch.conditionID].lineRangeEnd {
447				conditionFulfilled = true
448			}
449		default:
450			// ingore other condition types
451			conditionFulfilled = !global.conditions[conditionMatch.conditionID].negated
452		}
453		if conditionFulfilled {
454			if global.conditions[conditionMatch.conditionID].negated {
455				result.matches = Matches{}
456				return
457			}
458			conditionStatus[conditionMatch.conditionID] = true
459		}
460	}
461	for i := range conditionStatus {
462		if conditionStatus[i] != true && !global.conditions[i].negated {
463			result.matches = Matches{}
464			return
465		}
466	}
467
468MatchLoop:
469	// check for each match whether preceded/followed/surrounded conditions are fulfilled
470	for matchIndex := 0; matchIndex < len(result.matches); {
471		match := result.matches[matchIndex]
472		lineno := match.lineno
473		conditionStatus := make([]bool, len(global.conditions))
474		for _, conditionMatch := range result.conditionMatches {
475			conditionFulfilled := false
476			maxAllowedDistance := global.conditions[conditionMatch.conditionID].within
477			var actualDistance int64 = -1
478			switch global.conditions[conditionMatch.conditionID].conditionType {
479			case ConditionPreceded:
480				actualDistance = lineno - conditionMatch.lineno
481				if actualDistance == 0 {
482					conditionFulfilled = conditionMatch.start < match.start
483				} else {
484					conditionFulfilled = (actualDistance >= 0) && (maxAllowedDistance == -1 || actualDistance <= maxAllowedDistance)
485				}
486			case ConditionFollowed:
487				actualDistance = conditionMatch.lineno - lineno
488				if actualDistance == 0 {
489					conditionFulfilled = conditionMatch.start > match.start
490				} else {
491					conditionFulfilled = (actualDistance >= 0) && (maxAllowedDistance == -1 || actualDistance <= maxAllowedDistance)
492				}
493			case ConditionSurrounded:
494				if lineno > conditionMatch.lineno {
495					actualDistance = lineno - conditionMatch.lineno
496				} else {
497					actualDistance = conditionMatch.lineno - lineno
498				}
499				if actualDistance == 0 {
500					conditionFulfilled = true
501				} else {
502					conditionFulfilled = (actualDistance >= 0) && (maxAllowedDistance == -1 || actualDistance <= maxAllowedDistance)
503				}
504			default:
505				// ingore other condition types
506				conditionFulfilled = !global.conditions[conditionMatch.conditionID].negated
507			}
508			if conditionFulfilled {
509				if global.conditions[conditionMatch.conditionID].negated {
510					goto ConditionFailed
511				} else {
512					conditionStatus[conditionMatch.conditionID] = true
513				}
514			}
515		}
516		for i := range conditionStatus {
517			if conditionStatus[i] != true && !global.conditions[i].negated {
518				goto ConditionFailed
519			}
520		}
521		matchIndex++
522		continue MatchLoop
523
524	ConditionFailed:
525		copy(result.matches[matchIndex:], result.matches[matchIndex+1:])
526		result.matches = result.matches[0 : len(result.matches)-1]
527	}
528}
529
530// getBeforeContextFromFile gets the context lines directly from the file.
531// It is used when the context lines exceed the currently buffered data from the file.
532func getBeforeContextFromFile(target string, offset int64, start int) *string {
533	var contextBeforeStart int
534	infile, _ := os.Open(target)
535	seekPosition := offset + int64(start) - int64(InputBlockSize)
536	if seekPosition < 0 {
537		seekPosition = 0
538	}
539	count := InputBlockSize
540	if offset == 0 && start < InputBlockSize {
541		count = start
542	}
543	infile.Seek(seekPosition, 0)
544	reader := bufio.NewReader(infile)
545	buffer := make([]byte, count)
546	reader.Read(buffer)
547
548	lineStart := len(buffer)
549	for lineStart > 0 && buffer[lineStart-1] != 0x0a {
550		lineStart--
551	}
552	if lineStart > 0 {
553		contextBeforeStart = lineStart - 1
554		precedingLinesFound := 0
555		for contextBeforeStart > 0 {
556			if buffer[contextBeforeStart-1] == 0x0a {
557				precedingLinesFound++
558				if precedingLinesFound == options.ContextBefore {
559					break
560				}
561			}
562			contextBeforeStart--
563		}
564		tmp := string(buffer[contextBeforeStart : lineStart-1])
565		return &tmp
566	}
567	return nil
568}
569
570// getAfterContextFromFile gets the context lines directly from the file.
571// It is used when the context lines exceed the currently buffered data from the file.
572func getAfterContextFromFile(target string, offset int64, end int) *string {
573	var contextAfterEnd int
574	infile, _ := os.Open(target)
575	seekPosition := offset + int64(end)
576	infile.Seek(seekPosition, 0)
577	reader := bufio.NewReader(infile)
578	buffer := make([]byte, InputBlockSize)
579	length, _ := reader.Read(buffer)
580
581	lineEnd := 0
582	for lineEnd < length && buffer[lineEnd] != 0x0a {
583		lineEnd++
584	}
585	if lineEnd < length-1 {
586		contextAfterEnd = lineEnd
587		followingLinesFound := 0
588		for contextAfterEnd < length-1 {
589			if buffer[contextAfterEnd+1] == 0x0a {
590				followingLinesFound++
591				if followingLinesFound == options.ContextAfter {
592					contextAfterEnd++
593					break
594				}
595			}
596			contextAfterEnd++
597		}
598		if followingLinesFound < options.ContextAfter && contextAfterEnd == length-1 && buffer[length-1] != 0x0a {
599			contextAfterEnd++
600		}
601		tmp := string(buffer[lineEnd+1 : contextAfterEnd])
602		return &tmp
603	}
604	return nil
605}
606
607// processInvertMatchesReader is used to handle the '--invert' option.
608// This function works line based and provides very limited support for options.
609func processReaderInvertMatch(reader io.Reader, matchRegexes []*regexp.Regexp, target string) error {
610	matches := make([]Match, 0, 16)
611	var linecount int64
612	var matchFound bool
613	scanner := bufio.NewScanner(reader)
614	for scanner.Scan() {
615		line := scanner.Text()
616		linecount++
617		matchFound = false
618		for _, re := range global.matchRegexes {
619			if re.MatchString(line) {
620				matchFound = true
621			}
622		}
623		if !matchFound {
624			if options.FilesWithMatches || options.FilesWithoutMatch {
625				global.resultsChan <- &Result{matches: []Match{Match{}}, target: target}
626				return nil
627			}
628			m := Match{
629				lineno: linecount,
630				line:   line}
631			matches = append(matches, m)
632
633		}
634	}
635	result := &Result{matches: matches, target: target}
636	global.resultsChan <- result
637	return nil
638}
639