1-- © 2008-2013 David Given.
2-- WordGrinder is licensed under the MIT open source license. See the COPYING
3-- file in this distribution for the full text.
4
5local ITALIC = wg.ITALIC
6local UNDERLINE = wg.UNDERLINE
7local BOLD = wg.BOLD
8local ParseWord = wg.parseword
9local WriteU8 = wg.writeu8
10local bitand = bit32.band
11local bitor = bit32.bor
12local bitxor = bit32.bxor
13local bit = bit32.btest
14local string_char = string.char
15local string_find = string.find
16local string_sub = string.sub
17local table_concat = table.concat
18
19-----------------------------------------------------------------------------
20-- The importer itself.
21
-- Reads an HTML stream and converts it into a new document.
--
-- fp: an open file handle; its whole contents are read with fp:read("*a").
-- Returns the document created by CreateDocument(), filled in through the
-- importer returned by CreateImporter().
--
-- Everything up to and including the first <body> tag is discarded. Note
-- that if the input contains no <body> tag at all, that search consumes the
-- whole input and no tokens are left for the parser.

local function loadhtmlfile(fp)
	local data = fp:read("*a")

	-- Normalise whitespace: tabs and form feeds become spaces and CRLF
	-- becomes LF. This makes things far easier to parse.

	data = data:gsub("[\t\f]", " ")
	data = data:gsub("\r\n", "\n")

	-- Canonicalise the string, making it valid UTF-8.

	data = CanonicaliseString(data)

	-- Collapse complex elements: attributes are stripped so that only the
	-- tag name remains (closing tags do not match and are left alone).

	data = data:gsub("< ?(%w+) ?[^>]*(/?)>", "<%1%2>")

	-- Helper function for reading tokens from the HTML stream. Each call
	-- returns the next token (a single space or newline, a lower-cased tag,
	-- an entity, a run of text, or a single stray '<' or '&'), or nil once
	-- the input is exhausted.

	local pos = 1
	local len = data:len()
	local function tokens()
		-- When pos == len there is still one unread character, so only
		-- stop once pos has moved past the end of the data.
		if pos > len then
			return nil
		end

		local s, e, t
		s, e, t = string_find(data, "^([ \n])", pos)
		if s then pos = e+1 return t end

		-- Skip any other control character. The search has to start at
		-- pos; without it the anchor tests the first byte of the document.
		if string_find(data, "^%c", pos) then
			pos = pos + 1
			return tokens()
		end

		s, e, t = string_find(data, "^(<[^>]*>)", pos)
		if s then pos = e+1 return t:lower() end

		-- Only accept well-formed entities (&name; &#123; &#x1f;) so that
		-- a stray '&' cannot swallow everything up to the next ';'.
		s, e, t = string_find(data, "^(&#?%w+;)", pos)
		if s then pos = e+1 return t end

		s, e, t = string_find(data, "^([^ <&\n]+)", pos)
		if s then pos = e+1 return t end

		-- A '<' or '&' that starts neither a tag nor an entity: return
		-- exactly that one character so it does not overlap the next token.
		t = string_sub(data, pos, pos)
		pos = pos + 1
		return t
	end

	-- Skip tokens until we hit a <body>.

	for t in tokens do
		if (t == "<body>") then
			break
		end
	end

	-- Importer state: the style applied to the paragraph currently being
	-- built, and whether we are inside a <pre> block.

	local document = CreateDocument()
	local importer = CreateImporter(document)
	local style = "P"
	local pre = false

	-- Ends the current paragraph using the pending style, then resets the
	-- style to plain "P".
	local function flush()
		importer:flushparagraph(style)
		style = "P"
	end

	-- Ends the current word; the pre flag is passed on to the importer.
	local function flushword()
		importer:flushword(pre)
	end

	-- Line break: ends the paragraph but stays in "PRE" style while inside
	-- a <pre> block.
	local function flushpre()
		flush()
		if pre then
			style = "PRE"
		end
	end

	-- Define the element look-up table, mapping a token to its handler.

	local elements =
	{
		[" "] = flushword,
		["<p>"] = flush,
		["<br>"] = flushpre,
		["<br/>"] = flushpre,
		["</h1>"] = flush,
		["</h2>"] = flush,
		["</h3>"] = flush,
		["</h4>"] = flush,
		["<h1>"] = function() flush() style = "H1" end,
		["<h2>"] = function() flush() style = "H2" end,
		["<h3>"] = function() flush() style = "H3" end,
		["<h4>"] = function() flush() style = "H4" end,
		["<li>"] = function() flush() style = "LB" end,
		["<i>"] = function() importer:style_on(ITALIC) end,
		["</i>"] = function() importer:style_off(ITALIC) end,
		["<em>"] = function() importer:style_on(ITALIC) end,
		["</em>"] = function() importer:style_off(ITALIC) end,
		["<u>"] = function() importer:style_on(UNDERLINE) end,
		["</u>"] = function() importer:style_off(UNDERLINE) end,
		["<b>"] = function() importer:style_on(BOLD) end,
		["</b>"] = function() importer:style_off(BOLD) end,
		["<pre>"] = function() flush() style = "PRE" pre = true end,
		["</pre>"] = function() flush() pre = false end,
		["\n"] = function() if pre then flush() style = "PRE" else flushword() end end
	}

	-- Actually do the parsing.

	importer:reset()
	for t in tokens do
		local handler = elements[t]
		if handler then
			handler()
		elseif t == "<" or t == "&" then
			-- Stray markup character: keep it as literal text rather than
			-- silently dropping it.
			importer:text(t)
		elseif string_find(t, "^<") then
			-- Unrecognised tag: do nothing.
		elseif string_find(t, "^&") then
			-- Entities which cannot be decoded are dropped.
			local decoded = DecodeHTMLEntity(t)
			if decoded then
				importer:text(decoded)
			end
		else
			importer:text(t)
		end
	end
	flush()

	return document
end
151
-- Command entry point for importing an HTML file: hands the filename, the
-- dialogue title and the HTML loader to ImportFileWithUI and returns
-- whatever that call returns.
function Cmd.ImportHTMLFile(filename)
	local title = "Import HTML File"
	return ImportFileWithUI(filename, title, loadhtmlfile)
end
155