1package html
2
3import (
4	"github.com/tdewolff/parse/v2"
5	"github.com/tdewolff/parse/v2/html"
6)
7
8// Token is a single token unit with an attribute value (if given) and hash of the data.
9type Token struct {
10	html.TokenType
11	Hash    Hash
12	Data    []byte
13	Text    []byte
14	AttrVal []byte
15	Traits  traits
16	Offset  int
17}
18
// TokenBuffer is a buffer that allows for token look-ahead.
// Tokens are read from the lexer on demand and kept in buf until they are
// shifted out.
type TokenBuffer struct {
	r *parse.Input // input whose Offset is recorded on every token that is read
	l *html.Lexer  // lexer that produces the tokens

	buf []Token // buffered tokens; refilled and compacted by Peek
	pos int     // index in buf of the next token to hand out

	attrBuffer []*Token // scratch slice reused as the result of Attributes
}
29
30// NewTokenBuffer returns a new TokenBuffer.
31func NewTokenBuffer(r *parse.Input, l *html.Lexer) *TokenBuffer {
32	return &TokenBuffer{
33		r:   r,
34		l:   l,
35		buf: make([]Token, 0, 8),
36	}
37}
38
39func (z *TokenBuffer) read(t *Token) {
40	t.Offset = z.r.Offset()
41	t.TokenType, t.Data = z.l.Next()
42	t.Text = z.l.Text()
43	if t.TokenType == html.AttributeToken {
44		t.Offset += 1 + len(t.Text) + 1
45		t.AttrVal = z.l.AttrVal()
46		if len(t.AttrVal) > 1 && (t.AttrVal[0] == '"' || t.AttrVal[0] == '\'') {
47			t.Offset++
48			t.AttrVal = t.AttrVal[1 : len(t.AttrVal)-1] // quotes will be readded in attribute loop if necessary
49		}
50		t.Hash = ToHash(t.Text)
51		t.Traits = attrMap[t.Hash]
52	} else if t.TokenType == html.StartTagToken || t.TokenType == html.EndTagToken {
53		t.AttrVal = nil
54		t.Hash = ToHash(t.Text)
55		t.Traits = tagMap[t.Hash] // zero if not exist
56	} else {
57		t.AttrVal = nil
58		t.Hash = 0
59		t.Traits = 0
60	}
61}
62
63// Peek returns the ith element and possibly does an allocation.
64// Peeking past an error will panic.
65func (z *TokenBuffer) Peek(pos int) *Token {
66	pos += z.pos
67	if pos >= len(z.buf) {
68		if len(z.buf) > 0 && z.buf[len(z.buf)-1].TokenType == html.ErrorToken {
69			return &z.buf[len(z.buf)-1]
70		}
71
72		c := cap(z.buf)
73		d := len(z.buf) - z.pos
74		p := pos - z.pos + 1 // required peek length
75		var buf []Token
76		if 2*p > c {
77			buf = make([]Token, 0, 2*c+p)
78		} else {
79			buf = z.buf
80		}
81		copy(buf[:d], z.buf[z.pos:])
82
83		buf = buf[:p]
84		pos -= z.pos
85		for i := d; i < p; i++ {
86			z.read(&buf[i])
87			if buf[i].TokenType == html.ErrorToken {
88				buf = buf[:i+1]
89				pos = i
90				break
91			}
92		}
93		z.pos, z.buf = 0, buf
94	}
95	return &z.buf[pos]
96}
97
98// Shift returns the first element and advances position.
99func (z *TokenBuffer) Shift() *Token {
100	if z.pos >= len(z.buf) {
101		t := &z.buf[:1][0]
102		z.read(t)
103		return t
104	}
105	t := &z.buf[z.pos]
106	z.pos++
107	return t
108}
109
110// Attributes extracts the gives attribute hashes from a tag.
111// It returns in the same order pointers to the requested token data or nil.
112func (z *TokenBuffer) Attributes(hashes ...Hash) []*Token {
113	n := 0
114	for {
115		if t := z.Peek(n); t.TokenType != html.AttributeToken {
116			break
117		}
118		n++
119	}
120	if len(hashes) > cap(z.attrBuffer) {
121		z.attrBuffer = make([]*Token, len(hashes))
122	} else {
123		z.attrBuffer = z.attrBuffer[:len(hashes)]
124		for i := range z.attrBuffer {
125			z.attrBuffer[i] = nil
126		}
127	}
128	for i := z.pos; i < z.pos+n; i++ {
129		attr := &z.buf[i]
130		for j, hash := range hashes {
131			if hash == attr.Hash {
132				z.attrBuffer[j] = attr
133			}
134		}
135	}
136	return z.attrBuffer
137}
138