1package xml
2
3import (
4	"fmt"
5	"io"
6	"testing"
7
8	"github.com/tdewolff/parse/v2"
9	"github.com/tdewolff/test"
10)
11
// TTs is a shorthand for a slice of TokenType; the test tables in this file
// use it to write expected token sequences compactly.
type TTs []TokenType
13
14func TestTokens(t *testing.T) {
15	var tokenTests = []struct {
16		xml      string
17		expected []TokenType
18	}{
19		{"", TTs{}},
20		{"<!-- comment -->", TTs{CommentToken}},
21		{"<!-- comment \n multi \r line -->", TTs{CommentToken}},
22		{"<foo/>", TTs{StartTagToken, StartTagCloseVoidToken}},
23		{"<foo \t\r\n/>", TTs{StartTagToken, StartTagCloseVoidToken}},
24		{"<foo:bar.qux-norf/>", TTs{StartTagToken, StartTagCloseVoidToken}},
25		{"<foo></foo>", TTs{StartTagToken, StartTagCloseToken, EndTagToken}},
26		{"<foo>text</foo>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
27		{"<foo/> text", TTs{StartTagToken, StartTagCloseVoidToken, TextToken}},
28		{"<a> <b> <c>text</c> </b> </a>", TTs{StartTagToken, StartTagCloseToken, TextToken, StartTagToken, StartTagCloseToken, TextToken, StartTagToken, StartTagCloseToken, TextToken, EndTagToken, TextToken, EndTagToken, TextToken, EndTagToken}},
29		{"<foo a='a' b=\"b\" c=c/>", TTs{StartTagToken, AttributeToken, AttributeToken, AttributeToken, StartTagCloseVoidToken}},
30		{"<foo a=\"\"/>", TTs{StartTagToken, AttributeToken, StartTagCloseVoidToken}},
31		{"<foo a-b=\"\"/>", TTs{StartTagToken, AttributeToken, StartTagCloseVoidToken}},
32		{"<foo \nchecked \r\n value\r=\t'=/>\"' />", TTs{StartTagToken, AttributeToken, AttributeToken, StartTagCloseVoidToken}},
33		{"<?xml?>", TTs{StartTagPIToken, StartTagClosePIToken}},
34		{"<?xml a=\"a\" ?>", TTs{StartTagPIToken, AttributeToken, StartTagClosePIToken}},
35		{"<?xml a=a?>", TTs{StartTagPIToken, AttributeToken, StartTagClosePIToken}},
36		{"<![CDATA[ test ]]>", TTs{CDATAToken}},
37		{"<!DOCTYPE>", TTs{DOCTYPEToken}},
38		{"<!DOCTYPE note SYSTEM \"Note.dtd\">", TTs{DOCTYPEToken}},
39		{`<!DOCTYPE note [<!ENTITY nbsp "&#xA0;"><!ENTITY writer "Writer: Donald Duck."><!ENTITY copyright "Copyright:]> W3Schools.">]>`, TTs{DOCTYPEToken}},
40		{"<!foo>", TTs{StartTagToken, StartTagCloseToken}},
41
42		// early endings
43		{"<!-- comment", TTs{CommentToken}},
44		{"<foo", TTs{StartTagToken}},
45		{"</foo", TTs{EndTagToken}},
46		{"<foo x", TTs{StartTagToken, AttributeToken}},
47		{"<foo x=", TTs{StartTagToken, AttributeToken}},
48		{"<foo x='", TTs{StartTagToken, AttributeToken}},
49		{"<foo x=''", TTs{StartTagToken, AttributeToken}},
50		{"<?xml", TTs{StartTagPIToken}},
51		{"<![CDATA[ test", TTs{CDATAToken}},
52		{"<!DOCTYPE note SYSTEM", TTs{DOCTYPEToken}},
53
54		// go fuzz
55		{"</", TTs{EndTagToken}},
56		{"</\n", TTs{EndTagToken}},
57	}
58	for _, tt := range tokenTests {
59		t.Run(tt.xml, func(t *testing.T) {
60			l := NewLexer(parse.NewInputString(tt.xml))
61			i := 0
62			for {
63				token, _ := l.Next()
64				if token == ErrorToken {
65					test.T(t, l.Err(), io.EOF)
66					test.T(t, i, len(tt.expected), "when error occurred we must be at the end")
67					break
68				}
69				test.That(t, i < len(tt.expected), "index", i, "must not exceed expected token types size", len(tt.expected))
70				if i < len(tt.expected) {
71					test.T(t, token, tt.expected[i], "token types must match")
72				}
73				i++
74			}
75		})
76	}
77
78	// coverage
79	for i := 0; ; i++ {
80		if TokenType(i).String() == fmt.Sprintf("Invalid(%d)", i) {
81			break
82		}
83	}
84}
85
86func TestTags(t *testing.T) {
87	var tagTests = []struct {
88		xml      string
89		expected string
90	}{
91		{"<foo:bar.qux-norf/>", "foo:bar.qux-norf"},
92		{"<?xml?>", "xml"},
93		{"<foo?bar/qux>", "foo?bar/qux"},
94		{"<!DOCTYPE note SYSTEM \"Note.dtd\">", " note SYSTEM \"Note.dtd\""},
95
96		// early endings
97		{"<foo ", "foo"},
98	}
99	for _, tt := range tagTests {
100		t.Run(tt.xml, func(t *testing.T) {
101			l := NewLexer(parse.NewInputString(tt.xml))
102			for {
103				token, _ := l.Next()
104				if token == ErrorToken {
105					test.T(t, l.Err(), io.EOF)
106					test.Fail(t, "when error occurred we must be at the end")
107					break
108				} else if token == StartTagToken || token == StartTagPIToken || token == EndTagToken || token == DOCTYPEToken {
109					test.String(t, string(l.Text()), tt.expected, "tags must match")
110					break
111				}
112			}
113		})
114	}
115}
116
117func TestAttributes(t *testing.T) {
118	var attributeTests = []struct {
119		attr     string
120		expected []string
121	}{
122		{"<foo a=\"b\" />", []string{"a", "\"b\""}},
123		{"<foo \nchecked \r\n value\r=\t'=/>\"' />", []string{"checked", "", "value", "'=/>\"'"}},
124		{"<foo bar=\" a \n\t\r b \" />", []string{"bar", "\" a     b \""}},
125		{"<?xml a=b?>", []string{"a", "b"}},
126		{"<foo /=? >", []string{"/", "?"}},
127
128		// early endings
129		{"<foo x", []string{"x", ""}},
130		{"<foo x=", []string{"x", ""}},
131		{"<foo x='", []string{"x", "'"}},
132	}
133	for _, tt := range attributeTests {
134		t.Run(tt.attr, func(t *testing.T) {
135			l := NewLexer(parse.NewInputString(tt.attr))
136			i := 0
137			for {
138				token, _ := l.Next()
139				if token == ErrorToken {
140					test.T(t, l.Err(), io.EOF)
141					test.T(t, i, len(tt.expected), "when error occurred we must be at the end")
142					break
143				} else if token == AttributeToken {
144					test.That(t, i+1 < len(tt.expected), "index", i+1, "must not exceed expected attributes size", len(tt.expected))
145					if i+1 < len(tt.expected) {
146						test.String(t, string(l.Text()), tt.expected[i], "attribute keys must match")
147						test.String(t, string(l.AttrVal()), tt.expected[i+1], "attribute keys must match")
148						i += 2
149					}
150				}
151			}
152		})
153	}
154}
155
156func TestErrors(t *testing.T) {
157	var errorTests = []struct {
158		xml string
159		col int
160	}{
161		{"a\x00b", 2},
162		{"<\x00 b='5'>", 2},
163		{"<a\x00b='5'>", 3},
164		{"<a \x00='5'>", 4},
165		{"<a b\x00'5'>", 5},
166		{"<a b=\x005'>", 6},
167		{"<a b='\x00'>", 7},
168		{"<a b='5\x00>", 8},
169		{"<a b='5'\x00", 9},
170		{"</\x00a>", 3},
171		{"</ \x00>", 4},
172		{"</ a\x00", 5},
173		{"<!\x00", 3},
174		{"<![CDATA[\x00", 10},
175		{"/*\x00", 3},
176	}
177	for _, tt := range errorTests {
178		t.Run(tt.xml, func(t *testing.T) {
179			l := NewLexer(parse.NewInputString(tt.xml))
180			for {
181				token, _ := l.Next()
182				if token == ErrorToken {
183					if perr, ok := l.Err().(*parse.Error); ok {
184						_, col, _ := perr.Position()
185						test.T(t, col, tt.col)
186					} else {
187						test.Fail(t, "bad error:", l.Err())
188					}
189					break
190				}
191			}
192		})
193	}
194}
195
196func TestTextAndAttrVal(t *testing.T) {
197	l := NewLexer(parse.NewInputString(`<xml attr="val" >text<!--comment--><!DOCTYPE doctype><![CDATA[cdata]]>`))
198	_, data := l.Next()
199	test.Bytes(t, data, []byte("<xml"))
200	test.Bytes(t, l.Text(), []byte("xml"))
201	test.Bytes(t, l.AttrVal(), nil)
202
203	_, data = l.Next()
204	test.Bytes(t, data, []byte(` attr="val"`))
205	test.Bytes(t, l.Text(), []byte("attr"))
206	test.Bytes(t, l.AttrVal(), []byte(`"val"`))
207
208	_, data = l.Next()
209	test.Bytes(t, data, []byte(">"))
210	test.Bytes(t, l.Text(), nil)
211	test.Bytes(t, l.AttrVal(), nil)
212
213	_, data = l.Next()
214	test.Bytes(t, data, []byte("text"))
215	test.Bytes(t, l.Text(), []byte("text"))
216	test.Bytes(t, l.AttrVal(), nil)
217
218	_, data = l.Next()
219	test.Bytes(t, data, []byte("<!--comment-->"))
220	test.Bytes(t, l.Text(), []byte("comment"))
221	test.Bytes(t, l.AttrVal(), nil)
222
223	_, data = l.Next()
224	test.Bytes(t, data, []byte("<!DOCTYPE doctype>"))
225	test.Bytes(t, l.Text(), []byte(" doctype"))
226	test.Bytes(t, l.AttrVal(), nil)
227
228	_, data = l.Next()
229	test.Bytes(t, data, []byte("<![CDATA[cdata]]>"))
230	test.Bytes(t, l.Text(), []byte("cdata"))
231	test.Bytes(t, l.AttrVal(), nil)
232}
233
234func TestOffset(t *testing.T) {
235	z := parse.NewInputString(`<div attr="val">text</div>`)
236	l := NewLexer(z)
237	test.T(t, z.Offset(), 0)
238	_, _ = l.Next()
239	test.T(t, z.Offset(), 4) // <div
240	_, _ = l.Next()
241	test.T(t, z.Offset(), 15) // attr="val"
242	_, _ = l.Next()
243	test.T(t, z.Offset(), 16) // >
244	_, _ = l.Next()
245	test.T(t, z.Offset(), 20) // text
246	_, _ = l.Next()
247	test.T(t, z.Offset(), 26) // </div>
248}
249
250////////////////////////////////////////////////////////////////
251
252func ExampleNewLexer() {
253	l := NewLexer(parse.NewInputString("<span class='user'>John Doe</span>"))
254	out := ""
255	for {
256		tt, data := l.Next()
257		if tt == ErrorToken {
258			break
259		}
260		out += string(data)
261	}
262	fmt.Println(out)
263	// Output: <span class='user'>John Doe</span>
264}
265