-- © 2008-2013 David Given.
-- WordGrinder is licensed under the MIT open source license. See the COPYING
-- file in this distribution for the full text.

local ITALIC = wg.ITALIC
local UNDERLINE = wg.UNDERLINE
local BOLD = wg.BOLD
local ParseWord = wg.parseword
local WriteU8 = wg.writeu8
local bitand = bit32.band
local bitor = bit32.bor
local bitxor = bit32.bxor
local bit = bit32.btest
local string_char = string.char
local string_find = string.find
local string_sub = string.sub
local table_concat = table.concat

-----------------------------------------------------------------------------
-- The importer itself.

-- Reads the whole of `fp`, tokenises it as (very loosely parsed) HTML and
-- feeds the tokens found after the first <body> tag to an importer.
--
--   fp: an object whose :read("*a") returns the complete input as a string
--       (presumably an open file handle; confirm against ImportFileWithUI).
--
-- Returns the value produced by CreateDocument(), which is the document the
-- importer was constructed around.

local function loadhtmlfile(fp)
	local data = fp:read("*a")

	-- Collapse whitespace; this makes things far easier to parse. Tabs and
	-- form feeds become spaces and CRLF becomes a bare newline.

	data = data:gsub("[\t\f]", " ")
	data = data:gsub("\r\n", "\n")

	-- Canonicalise the string, making it valid UTF-8.

	data = CanonicaliseString(data)

	-- Collapse complex elements: anything following the name of an opening
	-- tag is discarded, so the tag can be used directly as a look-up key.

	data = data:gsub("< ?(%w+) ?[^>]*(/?)>", "<%1%2>")

	-- Helper function for reading tokens from the HTML stream. Each call
	-- returns the next token (a single space or newline, a lower-cased tag,
	-- an entity, or a run of plain text), or nil once the input is used up.

	local pos = 1
	local len = data:len()
	local function tokens()
		-- pos == len still addresses the final byte, so only stop once we
		-- have moved past it.
		if (pos > len) then
			return nil
		end

		local s, e, t
		s, e, t = string_find(data, "^([ \n])", pos)
		if s then pos = e+1 return t end

		-- Skip any other control character at the current position. The
		-- search has to start at pos; without it the pattern is anchored to
		-- the first byte of the buffer rather than the byte being examined.
		if string_find(data, "^%c", pos) then
			pos = pos + 1
			return tokens()
		end

		s, e, t = string_find(data, "^(<[^>]*>)", pos)
		if s then pos = e+1 return t:lower() end

		s, e, t = string_find(data, "^(&[^;]-;)", pos)
		if s then pos = e+1 return t end

		s, e, t = string_find(data, "^([^ <&\n]+)", pos)
		if s then pos = e+1 return t end

		-- Nothing matched: this is a '<' with no closing '>' or a '&' with
		-- no ';' after it. Consume exactly that one byte so the scan always
		-- makes progress and no byte is tokenised twice.
		t = string_sub(data, pos, pos)
		pos = pos + 1
		return t
	end

	-- Skip tokens until we hit a <body>.

	for t in tokens do
		if (t == "<body>") then
			break
		end
	end

	-- Define the element look-up table.

	local document = CreateDocument()
	local importer = CreateImporter(document)
	local style = "P"
	local pre = false

	-- Ends the current paragraph using the pending style, then resets the
	-- style to the default "P".
	local function flush()
		importer:flushparagraph(style)
		style = "P"
	end

	-- Ends the current word, telling the importer whether we are inside a
	-- <pre> section.
	local function flushword()
		importer:flushword(pre)
	end

	-- Line break: ends the paragraph, but keeps the "PRE" style going if we
	-- are still inside a <pre> section.
	local function flushpre()
		flush()
		if pre then
			style = "PRE"
		end
	end

	local elements =
	{
		[" "] = flushword,
		["<p>"] = flush,
		["<br>"] = flushpre,
		["<br/>"] = flushpre,
		["</h1>"] = flush,
		["</h2>"] = flush,
		["</h3>"] = flush,
		["</h4>"] = flush,
		["<h1>"] = function() flush() style = "H1" end,
		["<h2>"] = function() flush() style = "H2" end,
		["<h3>"] = function() flush() style = "H3" end,
		["<h4>"] = function() flush() style = "H4" end,
		["<li>"] = function() flush() style = "LB" end,
		["<i>"] = function() importer:style_on(ITALIC) end,
		["</i>"] = function() importer:style_off(ITALIC) end,
		["<em>"] = function() importer:style_on(ITALIC) end,
		["</em>"] = function() importer:style_off(ITALIC) end,
		["<u>"] = function() importer:style_on(UNDERLINE) end,
		["</u>"] = function() importer:style_off(UNDERLINE) end,
		["<b>"] = function() importer:style_on(BOLD) end,
		["</b>"] = function() importer:style_off(BOLD) end,
		["<pre>"] = function() flush() style = "PRE" pre = true end,
		["</pre>"] = function() flush() pre = false end,
		-- Inside <pre> a newline starts a new PRE paragraph; elsewhere it is
		-- treated like a space.
		["\n"] = function() if pre then flush() style = "PRE" else flushword() end end
	}

	-- Actually do the parsing.

	importer:reset()
	for t in tokens do
		local e = elements[t]
		if e then
			e()
		elseif string_find(t, "^<") then
			-- do nothing: tags without an entry in the table are ignored
		elseif string_find(t, "^&") then
			-- Entities are decoded; anything DecodeHTMLEntity rejects (returns
			-- a false value for) is dropped.
			e = DecodeHTMLEntity(t)
			if e then
				importer:text(e)
			end
		else
			importer:text(t)
		end
	end
	flush()

	return document
end

-- Imports `filename` as HTML by handing it, a dialogue title and the loader
-- above to ImportFileWithUI, and returns whatever ImportFileWithUI returns.

function Cmd.ImportHTMLFile(filename)
	return ImportFileWithUI(filename, "Import HTML File", loadhtmlfile)
end