1// Copyright 2017 The Prometheus Authors
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5//
6// http://www.apache.org/licenses/LICENSE-2.0
7//
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14//go:generate go get -u modernc.org/golex
15//go:generate golex -o=promlex.l.go promlex.l
16
17package textparse
18
19import (
20	"fmt"
21	"io"
22	"math"
23	"sort"
24	"strconv"
25	"strings"
26	"unicode/utf8"
27	"unsafe"
28
29	"github.com/pkg/errors"
30
31	"github.com/prometheus/prometheus/pkg/labels"
32	"github.com/prometheus/prometheus/pkg/value"
33)
34
// promlexer holds the scanning state used while tokenizing a byte slice.
type promlexer struct {
	// b is the input being tokenized.
	b []byte
	// i is the index of the byte currently under the cursor; start is the
	// index at which the current token begins (buf returns b[start:i]).
	i, start int
	// err is the most recent error recorded by the lexer; next sets it to
	// io.EOF once the input is exhausted.
	err error
	// state is the current lexer state.
	state int
}
42
// token identifies the kind of lexeme produced by the lexer.
type token int

// Token kinds. tInvalid and tEOF carry fixed values. Because iota counts
// every entry of the const block, tLinebreak evaluates to 2 and the remaining
// kinds follow consecutively; the value 1 is not assigned to any kind.
const (
	tInvalid   token = -1
	tEOF       token = 0
	tLinebreak token = iota
	tWhitespace
	tHelp
	tType
	tUnit
	tEofWord
	tText
	tComment
	tBlank
	tMName
	tBraceOpen
	tBraceClose
	tLName
	tLValue
	tComma
	tEqual
	tTimestamp
	tValue
)

// String returns the upper-case display name of the token kind. Values that
// do not correspond to a declared kind are rendered as "<invalid: N>".
func (t token) String() string {
	var name string
	switch t {
	case tInvalid:
		name = "INVALID"
	case tEOF:
		name = "EOF"
	case tLinebreak:
		name = "LINEBREAK"
	case tWhitespace:
		name = "WHITESPACE"
	case tHelp:
		name = "HELP"
	case tType:
		name = "TYPE"
	case tUnit:
		name = "UNIT"
	case tEofWord:
		name = "EOFWORD"
	case tText:
		name = "TEXT"
	case tComment:
		name = "COMMENT"
	case tBlank:
		name = "BLANK"
	case tMName:
		name = "MNAME"
	case tBraceOpen:
		name = "BOPEN"
	case tBraceClose:
		name = "BCLOSE"
	case tLName:
		name = "LNAME"
	case tLValue:
		name = "LVALUE"
	case tEqual:
		name = "EQUAL"
	case tComma:
		name = "COMMA"
	case tTimestamp:
		name = "TIMESTAMP"
	case tValue:
		name = "VALUE"
	default:
		// %d formats the underlying integer and does not call String again.
		name = fmt.Sprintf("<invalid: %d>", t)
	}
	return name
}
113
114// buf returns the buffer of the current token.
115func (l *promlexer) buf() []byte {
116	return l.b[l.start:l.i]
117}
118
119func (l *promlexer) cur() byte {
120	return l.b[l.i]
121}
122
123// next advances the promlexer to the next character.
124func (l *promlexer) next() byte {
125	l.i++
126	if l.i >= len(l.b) {
127		l.err = io.EOF
128		return byte(tEOF)
129	}
130	// Lex struggles with null bytes. If we are in a label value or help string, where
131	// they are allowed, consume them here immediately.
132	for l.b[l.i] == 0 && (l.state == sLValue || l.state == sMeta2 || l.state == sComment) {
133		l.i++
134	}
135	return l.b[l.i]
136}
137
138func (l *promlexer) Error(es string) {
139	l.err = errors.New(es)
140}
141
142// PromParser parses samples from a byte slice of samples in the official
143// Prometheus text exposition format.
144type PromParser struct {
145	l       *promlexer
146	series  []byte
147	text    []byte
148	mtype   MetricType
149	val     float64
150	ts      int64
151	hasTS   bool
152	start   int
153	offsets []int
154}
155
156// New returns a new parser of the byte slice.
157func NewPromParser(b []byte) Parser {
158	return &PromParser{l: &promlexer{b: append(b, '\n')}}
159}
160
161// Series returns the bytes of the series, the timestamp if set, and the value
162// of the current sample.
163func (p *PromParser) Series() ([]byte, *int64, float64) {
164	if p.hasTS {
165		return p.series, &p.ts, p.val
166	}
167	return p.series, nil, p.val
168}
169
170// Help returns the metric name and help text in the current entry.
171// Must only be called after Next returned a help entry.
172// The returned byte slices become invalid after the next call to Next.
173func (p *PromParser) Help() ([]byte, []byte) {
174	m := p.l.b[p.offsets[0]:p.offsets[1]]
175
176	// Replacer causes allocations. Replace only when necessary.
177	if strings.IndexByte(yoloString(p.text), byte('\\')) >= 0 {
178		return m, []byte(helpReplacer.Replace(string(p.text)))
179	}
180	return m, p.text
181}
182
183// Type returns the metric name and type in the current entry.
184// Must only be called after Next returned a type entry.
185// The returned byte slices become invalid after the next call to Next.
186func (p *PromParser) Type() ([]byte, MetricType) {
187	return p.l.b[p.offsets[0]:p.offsets[1]], p.mtype
188}
189
190// Unit returns the metric name and unit in the current entry.
191// Must only be called after Next returned a unit entry.
192// The returned byte slices become invalid after the next call to Next.
193func (p *PromParser) Unit() ([]byte, []byte) {
194	// The Prometheus format does not have units.
195	return nil, nil
196}
197
198// Comment returns the text of the current comment.
199// Must only be called after Next returned a comment entry.
200// The returned byte slice becomes invalid after the next call to Next.
201func (p *PromParser) Comment() []byte {
202	return p.text
203}
204
205// Metric writes the labels of the current sample into the passed labels.
206// It returns the string from which the metric was parsed.
207func (p *PromParser) Metric(l *labels.Labels) string {
208	// Allocate the full immutable string immediately, so we just
209	// have to create references on it below.
210	s := string(p.series)
211
212	*l = append(*l, labels.Label{
213		Name:  labels.MetricName,
214		Value: s[:p.offsets[0]-p.start],
215	})
216
217	for i := 1; i < len(p.offsets); i += 4 {
218		a := p.offsets[i] - p.start
219		b := p.offsets[i+1] - p.start
220		c := p.offsets[i+2] - p.start
221		d := p.offsets[i+3] - p.start
222
223		// Replacer causes allocations. Replace only when necessary.
224		if strings.IndexByte(s[c:d], byte('\\')) >= 0 {
225			*l = append(*l, labels.Label{Name: s[a:b], Value: lvalReplacer.Replace(s[c:d])})
226			continue
227		}
228		*l = append(*l, labels.Label{Name: s[a:b], Value: s[c:d]})
229	}
230
231	// Sort labels to maintain the sorted labels invariant.
232	sort.Sort(*l)
233
234	return s
235}
236
237// nextToken returns the next token from the promlexer. It skips over tabs
238// and spaces.
239func (p *PromParser) nextToken() token {
240	for {
241		if tok := p.l.Lex(); tok != tWhitespace {
242			return tok
243		}
244	}
245}
246
247func parseError(exp string, got token) error {
248	return errors.Errorf("%s, got %q", exp, got)
249}
250
251// Next advances the parser to the next sample. It returns false if no
252// more samples were read or an error occurred.
253func (p *PromParser) Next() (Entry, error) {
254	var err error
255
256	p.start = p.l.i
257	p.offsets = p.offsets[:0]
258
259	switch t := p.nextToken(); t {
260	case tEOF:
261		return EntryInvalid, io.EOF
262	case tLinebreak:
263		// Allow full blank lines.
264		return p.Next()
265
266	case tHelp, tType:
267		switch t := p.nextToken(); t {
268		case tMName:
269			p.offsets = append(p.offsets, p.l.start, p.l.i)
270		default:
271			return EntryInvalid, parseError("expected metric name after HELP", t)
272		}
273		switch t := p.nextToken(); t {
274		case tText:
275			if len(p.l.buf()) > 1 {
276				p.text = p.l.buf()[1:]
277			} else {
278				p.text = []byte{}
279			}
280		default:
281			return EntryInvalid, parseError("expected text in HELP", t)
282		}
283		switch t {
284		case tType:
285			switch s := yoloString(p.text); s {
286			case "counter":
287				p.mtype = MetricTypeCounter
288			case "gauge":
289				p.mtype = MetricTypeGauge
290			case "histogram":
291				p.mtype = MetricTypeHistogram
292			case "summary":
293				p.mtype = MetricTypeSummary
294			case "untyped":
295				p.mtype = MetricTypeUnknown
296			default:
297				return EntryInvalid, errors.Errorf("invalid metric type %q", s)
298			}
299		case tHelp:
300			if !utf8.Valid(p.text) {
301				return EntryInvalid, errors.Errorf("help text is not a valid utf8 string")
302			}
303		}
304		if t := p.nextToken(); t != tLinebreak {
305			return EntryInvalid, parseError("linebreak expected after metadata", t)
306		}
307		switch t {
308		case tHelp:
309			return EntryHelp, nil
310		case tType:
311			return EntryType, nil
312		}
313	case tComment:
314		p.text = p.l.buf()
315		if t := p.nextToken(); t != tLinebreak {
316			return EntryInvalid, parseError("linebreak expected after comment", t)
317		}
318		return EntryComment, nil
319
320	case tMName:
321		p.offsets = append(p.offsets, p.l.i)
322		p.series = p.l.b[p.start:p.l.i]
323
324		t2 := p.nextToken()
325		if t2 == tBraceOpen {
326			if err := p.parseLVals(); err != nil {
327				return EntryInvalid, err
328			}
329			p.series = p.l.b[p.start:p.l.i]
330			t2 = p.nextToken()
331		}
332		if t2 != tValue {
333			return EntryInvalid, parseError("expected value after metric", t)
334		}
335		if p.val, err = strconv.ParseFloat(yoloString(p.l.buf()), 64); err != nil {
336			return EntryInvalid, err
337		}
338		// Ensure canonical NaN value.
339		if math.IsNaN(p.val) {
340			p.val = math.Float64frombits(value.NormalNaN)
341		}
342		p.hasTS = false
343		switch p.nextToken() {
344		case tLinebreak:
345			break
346		case tTimestamp:
347			p.hasTS = true
348			if p.ts, err = strconv.ParseInt(yoloString(p.l.buf()), 10, 64); err != nil {
349				return EntryInvalid, err
350			}
351			if t2 := p.nextToken(); t2 != tLinebreak {
352				return EntryInvalid, parseError("expected next entry after timestamp", t)
353			}
354		default:
355			return EntryInvalid, parseError("expected timestamp or new record", t)
356		}
357		return EntrySeries, nil
358
359	default:
360		err = errors.Errorf("%q is not a valid start token", t)
361	}
362	return EntryInvalid, err
363}
364
365func (p *PromParser) parseLVals() error {
366	t := p.nextToken()
367	for {
368		switch t {
369		case tBraceClose:
370			return nil
371		case tLName:
372		default:
373			return parseError("expected label name", t)
374		}
375		p.offsets = append(p.offsets, p.l.start, p.l.i)
376
377		if t := p.nextToken(); t != tEqual {
378			return parseError("expected equal", t)
379		}
380		if t := p.nextToken(); t != tLValue {
381			return parseError("expected label value", t)
382		}
383		if !utf8.Valid(p.l.buf()) {
384			return errors.Errorf("invalid UTF-8 label value")
385		}
386
387		// The promlexer ensures the value string is quoted. Strip first
388		// and last character.
389		p.offsets = append(p.offsets, p.l.start+1, p.l.i-1)
390
391		// Free trailing commas are allowed.
392		if t = p.nextToken(); t == tComma {
393			t = p.nextToken()
394		}
395	}
396}
397
// lvalReplacer unescapes label values: \" becomes a double quote, \\ becomes
// a single backslash and \n becomes a line feed.
var lvalReplacer = strings.NewReplacer(
	`\"`, `"`,
	`\\`, `\`,
	`\n`, "\n",
)
403
// helpReplacer unescapes help text: \\ becomes a single backslash and \n
// becomes a line feed. Other backslash sequences are left untouched.
var helpReplacer = strings.NewReplacer(
	`\\`, `\`,
	`\n`, "\n",
)
408
// yoloString reinterprets b as a string without copying. The result shares
// b's memory, so it is only valid while b is left unmodified.
func yoloString(b []byte) string {
	hdr := unsafe.Pointer(&b)
	return *(*string)(hdr)
}
412