1// Copyright 2017 The Prometheus Authors
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5//
6// http://www.apache.org/licenses/LICENSE-2.0
7//
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14//go:generate go get github.com/cznic/golex
15//go:generate golex -o=promlex.l.go promlex.l
16
17package textparse
18
19import (
20	"errors"
21	"fmt"
22	"io"
23	"math"
24	"sort"
25	"strconv"
26	"strings"
27	"unicode/utf8"
28	"unsafe"
29
30	"github.com/prometheus/prometheus/pkg/labels"
31	"github.com/prometheus/prometheus/pkg/value"
32)
33
// promlexer holds the scanning state used to tokenize a byte buffer.
type promlexer struct {
	// b is the input buffer being scanned.
	b     []byte
	// i is the index of the current byte in b.
	i     int
	// start is the index in b at which the current token begins (see buf).
	start int
	// err is set to io.EOF once next runs past the end of b, or to the
	// error reported through Error.
	err   error
	// state is the current lexer state; next consults it to decide whether
	// null bytes are skipped.
	state int
}
41
// token identifies the kind of a lexical element handed out by the lexer.
type token int

// Token kinds. tInvalid and tEOF carry fixed values; every kind from
// tLinebreak onwards is numbered consecutively through iota, which makes
// tLinebreak equal to 2.
const (
	tInvalid   token = -1
	tEOF       token = 0
	tLinebreak token = iota
	tWhitespace
	tHelp
	tType
	tUnit
	tEofWord
	tText
	tComment
	tBlank
	tMName
	tBraceOpen
	tBraceClose
	tLName
	tLValue
	tComma
	tEqual
	tTimestamp
	tValue
)

// tokenNames maps every declared token kind to its display name.
var tokenNames = map[token]string{
	tInvalid:    "INVALID",
	tEOF:        "EOF",
	tLinebreak:  "LINEBREAK",
	tWhitespace: "WHITESPACE",
	tHelp:       "HELP",
	tType:       "TYPE",
	tUnit:       "UNIT",
	tEofWord:    "EOFWORD",
	tText:       "TEXT",
	tComment:    "COMMENT",
	tBlank:      "BLANK",
	tMName:      "MNAME",
	tBraceOpen:  "BOPEN",
	tBraceClose: "BCLOSE",
	tLName:      "LNAME",
	tLValue:     "LVALUE",
	tEqual:      "EQUAL",
	tComma:      "COMMA",
	tTimestamp:  "TIMESTAMP",
	tValue:      "VALUE",
}

// String returns the upper-case display name of t. Values that do not
// correspond to a declared token kind are rendered as "<invalid: N>".
func (t token) String() string {
	if name, known := tokenNames[t]; known {
		return name
	}
	return fmt.Sprintf("<invalid: %d>", t)
}
112
113// buf returns the buffer of the current token.
114func (l *promlexer) buf() []byte {
115	return l.b[l.start:l.i]
116}
117
118func (l *promlexer) cur() byte {
119	return l.b[l.i]
120}
121
122// next advances the promlexer to the next character.
123func (l *promlexer) next() byte {
124	l.i++
125	if l.i >= len(l.b) {
126		l.err = io.EOF
127		return byte(tEOF)
128	}
129	// Lex struggles with null bytes. If we are in a label value or help string, where
130	// they are allowed, consume them here immediately.
131	for l.b[l.i] == 0 && (l.state == sLValue || l.state == sMeta2 || l.state == sComment) {
132		l.i++
133	}
134	return l.b[l.i]
135}
136
137func (l *promlexer) Error(es string) {
138	l.err = errors.New(es)
139}
140
// PromParser parses samples from a byte slice of samples in the official
// Prometheus text exposition format.
type PromParser struct {
	// l is the lexer that supplies the tokens.
	l       *promlexer
	// series is the slice of the lexer buffer holding the current series.
	series  []byte
	// text is the text of the current HELP/TYPE entry or comment.
	text    []byte
	// mtype is the metric type parsed from the current TYPE entry.
	mtype   MetricType
	// val is the value of the current sample.
	val     float64
	// ts is the timestamp of the current sample; only meaningful if hasTS.
	ts      int64
	// hasTS reports whether the current sample carried a timestamp.
	hasTS   bool
	// start is the lexer position at which the current entry began.
	start   int
	// offsets holds positions in the lexer buffer for the current entry.
	// For a series: the end of the metric name, followed by four values per
	// label (name start, name end, value start, value end). For HELP and
	// TYPE entries: the start and end of the metric name.
	offsets []int
}
154
155// New returns a new parser of the byte slice.
156func NewPromParser(b []byte) Parser {
157	return &PromParser{l: &promlexer{b: append(b, '\n')}}
158}
159
160// Series returns the bytes of the series, the timestamp if set, and the value
161// of the current sample.
162func (p *PromParser) Series() ([]byte, *int64, float64) {
163	if p.hasTS {
164		return p.series, &p.ts, p.val
165	}
166	return p.series, nil, p.val
167}
168
169// Help returns the metric name and help text in the current entry.
170// Must only be called after Next returned a help entry.
171// The returned byte slices become invalid after the next call to Next.
172func (p *PromParser) Help() ([]byte, []byte) {
173	m := p.l.b[p.offsets[0]:p.offsets[1]]
174
175	// Replacer causes allocations. Replace only when necessary.
176	if strings.IndexByte(yoloString(p.text), byte('\\')) >= 0 {
177		return m, []byte(helpReplacer.Replace(string(p.text)))
178	}
179	return m, p.text
180}
181
182// Type returns the metric name and type in the current entry.
183// Must only be called after Next returned a type entry.
184// The returned byte slices become invalid after the next call to Next.
185func (p *PromParser) Type() ([]byte, MetricType) {
186	return p.l.b[p.offsets[0]:p.offsets[1]], p.mtype
187}
188
189// Unit returns the metric name and unit in the current entry.
190// Must only be called after Next returned a unit entry.
191// The returned byte slices become invalid after the next call to Next.
192func (p *PromParser) Unit() ([]byte, []byte) {
193	// The Prometheus format does not have units.
194	return nil, nil
195}
196
197// Comment returns the text of the current comment.
198// Must only be called after Next returned a comment entry.
199// The returned byte slice becomes invalid after the next call to Next.
200func (p *PromParser) Comment() []byte {
201	return p.text
202}
203
204// Metric writes the labels of the current sample into the passed labels.
205// It returns the string from which the metric was parsed.
206func (p *PromParser) Metric(l *labels.Labels) string {
207	// Allocate the full immutable string immediately, so we just
208	// have to create references on it below.
209	s := string(p.series)
210
211	*l = append(*l, labels.Label{
212		Name:  labels.MetricName,
213		Value: s[:p.offsets[0]-p.start],
214	})
215
216	for i := 1; i < len(p.offsets); i += 4 {
217		a := p.offsets[i] - p.start
218		b := p.offsets[i+1] - p.start
219		c := p.offsets[i+2] - p.start
220		d := p.offsets[i+3] - p.start
221
222		// Replacer causes allocations. Replace only when necessary.
223		if strings.IndexByte(s[c:d], byte('\\')) >= 0 {
224			*l = append(*l, labels.Label{Name: s[a:b], Value: lvalReplacer.Replace(s[c:d])})
225			continue
226		}
227		*l = append(*l, labels.Label{Name: s[a:b], Value: s[c:d]})
228	}
229
230	// Sort labels. We can skip the first entry since the metric name is
231	// already at the right place.
232	sort.Sort((*l)[1:])
233
234	return s
235}
236
237// nextToken returns the next token from the promlexer. It skips over tabs
238// and spaces.
239func (p *PromParser) nextToken() token {
240	for {
241		if tok := p.l.Lex(); tok != tWhitespace {
242			return tok
243		}
244	}
245}
246
247func parseError(exp string, got token) error {
248	return fmt.Errorf("%s, got %q", exp, got)
249}
250
251// Next advances the parser to the next sample. It returns false if no
252// more samples were read or an error occurred.
253func (p *PromParser) Next() (Entry, error) {
254	var err error
255
256	p.start = p.l.i
257	p.offsets = p.offsets[:0]
258
259	switch t := p.nextToken(); t {
260	case tEOF:
261		return EntryInvalid, io.EOF
262	case tLinebreak:
263		// Allow full blank lines.
264		return p.Next()
265
266	case tHelp, tType:
267		switch t := p.nextToken(); t {
268		case tMName:
269			p.offsets = append(p.offsets, p.l.start, p.l.i)
270		default:
271			return EntryInvalid, parseError("expected metric name after HELP", t)
272		}
273		switch t := p.nextToken(); t {
274		case tText:
275			if len(p.l.buf()) > 1 {
276				p.text = p.l.buf()[1:]
277			} else {
278				p.text = []byte{}
279			}
280		default:
281			return EntryInvalid, parseError("expected text in HELP", t)
282		}
283		switch t {
284		case tType:
285			switch s := yoloString(p.text); s {
286			case "counter":
287				p.mtype = MetricTypeCounter
288			case "gauge":
289				p.mtype = MetricTypeGauge
290			case "histogram":
291				p.mtype = MetricTypeHistogram
292			case "summary":
293				p.mtype = MetricTypeSummary
294			case "untyped":
295				p.mtype = MetricTypeUnknown
296			default:
297				return EntryInvalid, fmt.Errorf("invalid metric type %q", s)
298			}
299		case tHelp:
300			if !utf8.Valid(p.text) {
301				return EntryInvalid, fmt.Errorf("help text is not a valid utf8 string")
302			}
303		}
304		if t := p.nextToken(); t != tLinebreak {
305			return EntryInvalid, parseError("linebreak expected after metadata", t)
306		}
307		switch t {
308		case tHelp:
309			return EntryHelp, nil
310		case tType:
311			return EntryType, nil
312		}
313	case tComment:
314		p.text = p.l.buf()
315		if t := p.nextToken(); t != tLinebreak {
316			return EntryInvalid, parseError("linebreak expected after comment", t)
317		}
318		return EntryComment, nil
319
320	case tMName:
321		p.offsets = append(p.offsets, p.l.i)
322		p.series = p.l.b[p.start:p.l.i]
323
324		t2 := p.nextToken()
325		if t2 == tBraceOpen {
326			if err := p.parseLVals(); err != nil {
327				return EntryInvalid, err
328			}
329			p.series = p.l.b[p.start:p.l.i]
330			t2 = p.nextToken()
331		}
332		if t2 != tValue {
333			return EntryInvalid, parseError("expected value after metric", t)
334		}
335		if p.val, err = strconv.ParseFloat(yoloString(p.l.buf()), 64); err != nil {
336			return EntryInvalid, err
337		}
338		// Ensure canonical NaN value.
339		if math.IsNaN(p.val) {
340			p.val = math.Float64frombits(value.NormalNaN)
341		}
342		p.hasTS = false
343		switch p.nextToken() {
344		case tLinebreak:
345			break
346		case tTimestamp:
347			p.hasTS = true
348			if p.ts, err = strconv.ParseInt(yoloString(p.l.buf()), 10, 64); err != nil {
349				return EntryInvalid, err
350			}
351			if t2 := p.nextToken(); t2 != tLinebreak {
352				return EntryInvalid, parseError("expected next entry after timestamp", t)
353			}
354		default:
355			return EntryInvalid, parseError("expected timestamp or new record", t)
356		}
357		return EntrySeries, nil
358
359	default:
360		err = fmt.Errorf("%q is not a valid start token", t)
361	}
362	return EntryInvalid, err
363}
364
365func (p *PromParser) parseLVals() error {
366	t := p.nextToken()
367	for {
368		switch t {
369		case tBraceClose:
370			return nil
371		case tLName:
372		default:
373			return parseError("expected label name", t)
374		}
375		p.offsets = append(p.offsets, p.l.start, p.l.i)
376
377		if t := p.nextToken(); t != tEqual {
378			return parseError("expected equal", t)
379		}
380		if t := p.nextToken(); t != tLValue {
381			return parseError("expected label value", t)
382		}
383		if !utf8.Valid(p.l.buf()) {
384			return fmt.Errorf("invalid UTF-8 label value")
385		}
386
387		// The promlexer ensures the value string is quoted. Strip first
388		// and last character.
389		p.offsets = append(p.offsets, p.l.start+1, p.l.i-1)
390
391		// Free trailing commas are allowed.
392		if t = p.nextToken(); t == tComma {
393			t = p.nextToken()
394		}
395	}
396}
397
// lvalReplacer unescapes label values: an escaped double quote becomes a
// double quote, an escaped backslash becomes a single backslash, and the
// two-character sequence backslash-n becomes a line feed.
var lvalReplacer = strings.NewReplacer(
	`\"`, "\"",
	`\\`, "\\",
	`\n`, "\n",
)
403
// helpReplacer unescapes help text: an escaped backslash becomes a single
// backslash and the two-character sequence backslash-n becomes a line feed.
var helpReplacer = strings.NewReplacer(
	`\\`, "\\",
	`\n`, "\n",
)
408
// yoloString reinterprets b as a string without copying. The result shares
// memory with b, so it is only valid for as long as b is left unmodified.
func yoloString(b []byte) string {
	hdr := unsafe.Pointer(&b)
	return *(*string)(hdr)
}
412