1// Copyright 2017 The Prometheus Authors
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5//
6// http://www.apache.org/licenses/LICENSE-2.0
7//
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14//go:generate go get -u modernc.org/golex
15//go:generate golex -o=promlex.l.go promlex.l
16
17package textparse
18
19import (
20	"fmt"
21	"io"
22	"math"
23	"sort"
24	"strconv"
25	"strings"
26	"unicode/utf8"
27	"unsafe"
28
29	"github.com/pkg/errors"
30
31	"github.com/prometheus/prometheus/pkg/exemplar"
32	"github.com/prometheus/prometheus/pkg/labels"
33	"github.com/prometheus/prometheus/pkg/value"
34)
35
// promlexer holds the state of the lexer that tokenizes input for PromParser.
type promlexer struct {
	b     []byte // Input buffer being tokenized.
	i     int    // Index into b of the current read position.
	start int    // Index into b where the current token begins (see buf).
	err   error  // Last recorded error: io.EOF set by next, or a message set via Error.
	state int    // Current lexer state; next compares it against sLValue, sMeta2 and sComment.
}
43
// token identifies the kind of lexical element returned by the lexer.
type token int
45
// Token kinds. tInvalid and tEOF have explicit values; tLinebreak is the
// third constant in the block, so iota is 2 there and every following
// constant increments from it (the value 1 is unused).
const (
	tInvalid   token = -1
	tEOF       token = 0 // Also returned (as a byte) by promlexer.next at end of input.
	tLinebreak token = iota
	tWhitespace
	tHelp
	tType
	tUnit
	tEOFWord
	tText
	tComment
	tBlank
	tMName
	tBraceOpen
	tBraceClose
	tLName
	tLValue
	tComma
	tEqual
	tTimestamp
	tValue
)
68
69func (t token) String() string {
70	switch t {
71	case tInvalid:
72		return "INVALID"
73	case tEOF:
74		return "EOF"
75	case tLinebreak:
76		return "LINEBREAK"
77	case tWhitespace:
78		return "WHITESPACE"
79	case tHelp:
80		return "HELP"
81	case tType:
82		return "TYPE"
83	case tUnit:
84		return "UNIT"
85	case tEOFWord:
86		return "EOFWORD"
87	case tText:
88		return "TEXT"
89	case tComment:
90		return "COMMENT"
91	case tBlank:
92		return "BLANK"
93	case tMName:
94		return "MNAME"
95	case tBraceOpen:
96		return "BOPEN"
97	case tBraceClose:
98		return "BCLOSE"
99	case tLName:
100		return "LNAME"
101	case tLValue:
102		return "LVALUE"
103	case tEqual:
104		return "EQUAL"
105	case tComma:
106		return "COMMA"
107	case tTimestamp:
108		return "TIMESTAMP"
109	case tValue:
110		return "VALUE"
111	}
112	return fmt.Sprintf("<invalid: %d>", t)
113}
114
115// buf returns the buffer of the current token.
116func (l *promlexer) buf() []byte {
117	return l.b[l.start:l.i]
118}
119
120func (l *promlexer) cur() byte {
121	return l.b[l.i]
122}
123
124// next advances the promlexer to the next character.
125func (l *promlexer) next() byte {
126	l.i++
127	if l.i >= len(l.b) {
128		l.err = io.EOF
129		return byte(tEOF)
130	}
131	// Lex struggles with null bytes. If we are in a label value or help string, where
132	// they are allowed, consume them here immediately.
133	for l.b[l.i] == 0 && (l.state == sLValue || l.state == sMeta2 || l.state == sComment) {
134		l.i++
135	}
136	return l.b[l.i]
137}
138
139func (l *promlexer) Error(es string) {
140	l.err = errors.New(es)
141}
142
// PromParser parses samples from a byte slice of samples in the official
// Prometheus text exposition format.
type PromParser struct {
	l      *promlexer // Lexer over the input buffer.
	series []byte     // Bytes of the current series (metric name plus label set), a sub-slice of l.b.
	text   []byte     // Text of the current HELP, TYPE or comment entry.
	mtype  MetricType // Metric type of the current TYPE entry.
	val    float64    // Value of the current sample.
	ts     int64      // Timestamp of the current sample; only meaningful if hasTS is set.
	hasTS  bool       // Whether the current sample carried a timestamp.
	start  int        // Lexer position (index into l.b) at which the current entry began.
	// offsets holds indexes into l.b for the current entry. For HELP/TYPE
	// entries it is [nameStart, nameEnd] of the metric name. For series it
	// is the end of the metric name followed by four values per label:
	// name start, name end, value start and value end (quotes excluded).
	offsets []int
}
156
157// NewPromParser returns a new parser of the byte slice.
158func NewPromParser(b []byte) Parser {
159	return &PromParser{l: &promlexer{b: append(b, '\n')}}
160}
161
162// Series returns the bytes of the series, the timestamp if set, and the value
163// of the current sample.
164func (p *PromParser) Series() ([]byte, *int64, float64) {
165	if p.hasTS {
166		return p.series, &p.ts, p.val
167	}
168	return p.series, nil, p.val
169}
170
171// Help returns the metric name and help text in the current entry.
172// Must only be called after Next returned a help entry.
173// The returned byte slices become invalid after the next call to Next.
174func (p *PromParser) Help() ([]byte, []byte) {
175	m := p.l.b[p.offsets[0]:p.offsets[1]]
176
177	// Replacer causes allocations. Replace only when necessary.
178	if strings.IndexByte(yoloString(p.text), byte('\\')) >= 0 {
179		return m, []byte(helpReplacer.Replace(string(p.text)))
180	}
181	return m, p.text
182}
183
184// Type returns the metric name and type in the current entry.
185// Must only be called after Next returned a type entry.
186// The returned byte slices become invalid after the next call to Next.
187func (p *PromParser) Type() ([]byte, MetricType) {
188	return p.l.b[p.offsets[0]:p.offsets[1]], p.mtype
189}
190
191// Unit returns the metric name and unit in the current entry.
192// Must only be called after Next returned a unit entry.
193// The returned byte slices become invalid after the next call to Next.
194func (p *PromParser) Unit() ([]byte, []byte) {
195	// The Prometheus format does not have units.
196	return nil, nil
197}
198
199// Comment returns the text of the current comment.
200// Must only be called after Next returned a comment entry.
201// The returned byte slice becomes invalid after the next call to Next.
202func (p *PromParser) Comment() []byte {
203	return p.text
204}
205
206// Metric writes the labels of the current sample into the passed labels.
207// It returns the string from which the metric was parsed.
208func (p *PromParser) Metric(l *labels.Labels) string {
209	// Allocate the full immutable string immediately, so we just
210	// have to create references on it below.
211	s := string(p.series)
212
213	*l = append(*l, labels.Label{
214		Name:  labels.MetricName,
215		Value: s[:p.offsets[0]-p.start],
216	})
217
218	for i := 1; i < len(p.offsets); i += 4 {
219		a := p.offsets[i] - p.start
220		b := p.offsets[i+1] - p.start
221		c := p.offsets[i+2] - p.start
222		d := p.offsets[i+3] - p.start
223
224		// Replacer causes allocations. Replace only when necessary.
225		if strings.IndexByte(s[c:d], byte('\\')) >= 0 {
226			*l = append(*l, labels.Label{Name: s[a:b], Value: lvalReplacer.Replace(s[c:d])})
227			continue
228		}
229		*l = append(*l, labels.Label{Name: s[a:b], Value: s[c:d]})
230	}
231
232	// Sort labels to maintain the sorted labels invariant.
233	sort.Sort(*l)
234
235	return s
236}
237
238// Exemplar writes the exemplar of the current sample into the passed
239// exemplar. It returns if an exemplar exists.
240func (p *PromParser) Exemplar(e *exemplar.Exemplar) bool {
241	return false
242}
243
244// nextToken returns the next token from the promlexer. It skips over tabs
245// and spaces.
246func (p *PromParser) nextToken() token {
247	for {
248		if tok := p.l.Lex(); tok != tWhitespace {
249			return tok
250		}
251	}
252}
253
254func parseError(exp string, got token) error {
255	return errors.Errorf("%s, got %q", exp, got)
256}
257
258// Next advances the parser to the next sample. It returns false if no
259// more samples were read or an error occurred.
260func (p *PromParser) Next() (Entry, error) {
261	var err error
262
263	p.start = p.l.i
264	p.offsets = p.offsets[:0]
265
266	switch t := p.nextToken(); t {
267	case tEOF:
268		return EntryInvalid, io.EOF
269	case tLinebreak:
270		// Allow full blank lines.
271		return p.Next()
272
273	case tHelp, tType:
274		switch t := p.nextToken(); t {
275		case tMName:
276			p.offsets = append(p.offsets, p.l.start, p.l.i)
277		default:
278			return EntryInvalid, parseError("expected metric name after HELP", t)
279		}
280		switch t := p.nextToken(); t {
281		case tText:
282			if len(p.l.buf()) > 1 {
283				p.text = p.l.buf()[1:]
284			} else {
285				p.text = []byte{}
286			}
287		default:
288			return EntryInvalid, parseError("expected text in HELP", t)
289		}
290		switch t {
291		case tType:
292			switch s := yoloString(p.text); s {
293			case "counter":
294				p.mtype = MetricTypeCounter
295			case "gauge":
296				p.mtype = MetricTypeGauge
297			case "histogram":
298				p.mtype = MetricTypeHistogram
299			case "summary":
300				p.mtype = MetricTypeSummary
301			case "untyped":
302				p.mtype = MetricTypeUnknown
303			default:
304				return EntryInvalid, errors.Errorf("invalid metric type %q", s)
305			}
306		case tHelp:
307			if !utf8.Valid(p.text) {
308				return EntryInvalid, errors.Errorf("help text is not a valid utf8 string")
309			}
310		}
311		if t := p.nextToken(); t != tLinebreak {
312			return EntryInvalid, parseError("linebreak expected after metadata", t)
313		}
314		switch t {
315		case tHelp:
316			return EntryHelp, nil
317		case tType:
318			return EntryType, nil
319		}
320	case tComment:
321		p.text = p.l.buf()
322		if t := p.nextToken(); t != tLinebreak {
323			return EntryInvalid, parseError("linebreak expected after comment", t)
324		}
325		return EntryComment, nil
326
327	case tMName:
328		p.offsets = append(p.offsets, p.l.i)
329		p.series = p.l.b[p.start:p.l.i]
330
331		t2 := p.nextToken()
332		if t2 == tBraceOpen {
333			if err := p.parseLVals(); err != nil {
334				return EntryInvalid, err
335			}
336			p.series = p.l.b[p.start:p.l.i]
337			t2 = p.nextToken()
338		}
339		if t2 != tValue {
340			return EntryInvalid, parseError("expected value after metric", t)
341		}
342		if p.val, err = parseFloat(yoloString(p.l.buf())); err != nil {
343			return EntryInvalid, err
344		}
345		// Ensure canonical NaN value.
346		if math.IsNaN(p.val) {
347			p.val = math.Float64frombits(value.NormalNaN)
348		}
349		p.hasTS = false
350		switch p.nextToken() {
351		case tLinebreak:
352			break
353		case tTimestamp:
354			p.hasTS = true
355			if p.ts, err = strconv.ParseInt(yoloString(p.l.buf()), 10, 64); err != nil {
356				return EntryInvalid, err
357			}
358			if t2 := p.nextToken(); t2 != tLinebreak {
359				return EntryInvalid, parseError("expected next entry after timestamp", t)
360			}
361		default:
362			return EntryInvalid, parseError("expected timestamp or new record", t)
363		}
364		return EntrySeries, nil
365
366	default:
367		err = errors.Errorf("%q is not a valid start token", t)
368	}
369	return EntryInvalid, err
370}
371
372func (p *PromParser) parseLVals() error {
373	t := p.nextToken()
374	for {
375		switch t {
376		case tBraceClose:
377			return nil
378		case tLName:
379		default:
380			return parseError("expected label name", t)
381		}
382		p.offsets = append(p.offsets, p.l.start, p.l.i)
383
384		if t := p.nextToken(); t != tEqual {
385			return parseError("expected equal", t)
386		}
387		if t := p.nextToken(); t != tLValue {
388			return parseError("expected label value", t)
389		}
390		if !utf8.Valid(p.l.buf()) {
391			return errors.Errorf("invalid UTF-8 label value")
392		}
393
394		// The promlexer ensures the value string is quoted. Strip first
395		// and last character.
396		p.offsets = append(p.offsets, p.l.start+1, p.l.i-1)
397
398		// Free trailing commas are allowed.
399		if t = p.nextToken(); t == tComma {
400			t = p.nextToken()
401		}
402	}
403}
404
// lvalReplacer unescapes label values: an escaped double quote, an escaped
// backslash and the two-character sequence backslash-n are turned into a
// double quote, a backslash and a line feed respectively.
var lvalReplacer = strings.NewReplacer(
	`\"`, `"`,
	`\\`, `\`,
	`\n`, "\n",
)
410
// helpReplacer unescapes help text: an escaped backslash becomes a single
// backslash and the two-character sequence backslash-n becomes a line feed.
// Unlike in label values, escaped double quotes are left untouched.
var helpReplacer = strings.NewReplacer(
	`\\`, `\`,
	`\n`, "\n",
)
415
// yoloString reinterprets b as a string without copying. The result shares
// b's memory, so it is only valid while b is neither modified nor reused.
func yoloString(b []byte) string {
	hdr := unsafe.Pointer(&b)
	return *(*string)(hdr)
}
419
// parseFloat parses s as a 64-bit float. Inputs containing 'p', 'P' or '_'
// are rejected up front to keep to pre-Go 1.13 float formats; everything
// else is handed to strconv.ParseFloat.
func parseFloat(s string) (float64, error) {
	if strings.IndexAny(s, "pP_") >= 0 {
		return 0, fmt.Errorf("unsupported character in float")
	}
	f, err := strconv.ParseFloat(s, 64)
	return f, err
}
427