-- Lua HTML parser
-- Original: https://gist.github.com/exebetche/6126573
-- Adapted by Andrey Gavrilov
-- Module table; the public functions are attached to it below.
local parser = {}

-- Void elements: tags that never have content or an end tag.
-- The parser closes them as soon as they are opened.
local empty_tags = {
  area = true,
  base = true,
  basefont = true,
  br = true,
  col = true,
  embed = true,
  frame = true,
  hr = true,
  img = true,
  input = true,
  isindex = true,
  link = true,
  meta = true,
  param = true
}
-- Omittable end tags (siblings).
-- Maps a tag that is being opened to the set of tags whose end tag is
-- implied by it: when the innermost open element is in that set, it is
-- closed automatically before the new element is inserted.
-- See http://www.w3.org/TR/html5/syntax.html#optional-tags
local omittable_tags = {
  -- table sections
  tbody = {
    thead = true,
    tbody = true,
    tfoot = true
  },
  thead = {
    thead = true,
    tbody = true,
    tfoot = true
  },
  tfoot = {
    thead = true,
    tbody = true,
    tfoot = true
  },
  -- table cells and rows
  td = {
    td = true,
    th = true
  },
  th = {
    td = true,
    th = true
  },
  tr = {
    tr = true
  },
  -- definition lists
  dd = {
    dd = true,
    dt = true
  },
  dt = {
    dd = true,
    dt = true
  },
  -- select options
  optgroup = {
    optgroup = true,
    option = true
  },
  -- This entry used to be a second `optgroup` key, which simply overwrote
  -- the one above and left `option` without a rule. An option only implies
  -- the end of a preceding option; it must not close the optgroup that
  -- contains it.
  option = {
    option = true
  },
  -- block-level elements that imply the end of an open paragraph
  address = { p = true },
  article = { p = true },
  aside = { p = true },
  blockquote = { p = true },
  dir = { p = true },
  div = { p = true },
  dl = { p = true },
  fieldset = { p = true },
  footer = { p = true },
  form = { p = true },
  h1 = { p = true },
  h2 = { p = true },
  h3 = { p = true },
  h4 = { p = true },
  h5 = { p = true },
  h6 = { p = true },
  header = { p = true },
  hgroup = { p = true },
  hr = { p = true },
  menu = { p = true },
  nav = { p = true },
  ol = { p = true },
  p = { p = true },
  pre = { p = true },
  section = { p = true },
  table = { p = true },
  ul = { p = true }
}
-- Omittable end tags (children).
-- Maps an end tag to the set of descendant tags that may still be open when
-- it is met; such descendants are closed automatically before the end tag
-- itself is processed.
local omittable_tags2 = {
  table = { tr = true, td = true, p = true },
  tr = { td = true, p = true },
  td = { p = true }
}
-- Extract quoted attributes from the raw attribute text of a start tag.
-- Recognises `name="value"` and `name='value'` pairs that are preceded by
-- whitespace and have a non-empty value; anything else is ignored.
-- Single-quoted pairs are scanned second, so they win on duplicate names.
local function parse_attributes(attr)
  local attrs = {}
  local quoted_forms = {
    "%s([^%s=]+)=\"([^\"]+)\"",
    "%s([^%s=]+)='([^']+)'"
  }
  for _, form in ipairs(quoted_forms) do
    for name, value in string.gmatch(attr, form) do
      -- NOTE(review): the replacement text is the literal `[^\]\"`, which
      -- looks like a pattern pasted into the replacement slot. Kept as is so
      -- stored values do not change; confirm the intended escaping.
      -- The parentheses drop gsub's second result (the match count).
      attrs[name] = (string.gsub(value, '"', '[^\\]\\"'))
    end
  end
  return attrs
end

-- Build a text node holding `value`.
local function text_node(value)
  return { tagName = "textNode", value = value }
end

--- Parse an HTML string into a tree of plain Lua tables.
--
-- Element nodes are `{ tagName = <lower-cased name>, childNodes = {...} }`,
-- plus an `attr` table when the tag carried attribute text. Void elements
-- (see `empty_tags`) have `childNodes` set to nil. Text is stored as
-- `{ tagName = "textNode", value = <string> }`; only text that directly
-- follows a start tag is recorded. The body of a `<script>` element without
-- a `src` attribute is kept verbatim in a single text node.
--
-- Diagnostics about implied or mismatched end tags are written with `print`.
--
-- @param data string  HTML source.
-- @param lazy boolean  When true, omitted end tags are not inferred.
--   Defaults to false.
-- @return table  List of top-level nodes. Parsing stops early, returning
--   what has been built so far, when an end tag is met with no element open.
function parser.parse(data, lazy)
  local tree = {}
  -- Invariant: `level` is the number of open elements and `stack` holds
  -- `level + 1` child lists, so stack[level + 1] receives new children and
  -- the last entry of stack[level] is the innermost open element.
  -- (An open inline <script> counts in `level` but pushes no list.)
  local stack = { tree }
  local level = 0
  local node
  local script_open = false
  local script_val = ""
  lazy = lazy or false

  -- Innermost open element; only valid while level > 0.
  local function open_node()
    local siblings = stack[level]
    return siblings[#siblings]
  end

  -- Captures: "<", "/" and/or "!", tag name, attribute text, trailing "/"
  -- and/or "-", then the whitespace and text that follow the tag.
  -- Inside the tag-name class, `_-'` is read by Lua as a range from "_" to
  -- "'", which is empty, so "_", "-" and "'" are not accepted in tag names.
  -- The pattern is left untouched so tokenisation does not change.
  for _, op, tag, attr, op2, bl1, val, bl2 in string.gmatch(
    data,
    "(<)(%/?!?)([%w:_-'\\\"%[]+)(.-)(%/?%-?)>"..
    "([%s\r\n\t]*)([^<]*)([%s\r\n\t]*)"
  ) do
    local lower_tag = string.lower(tag)

    if script_open then
      -- NOTE(review): this branch and the head of the end-tag branch below
      -- were truncated in the source (the line broke off inside a gsub
      -- pattern anchored at the start of the script text). They are
      -- reconstructed from the surrounding logic; the lost gsub is not
      -- reproduced and the script text is stored verbatim. Confirm against
      -- the upstream version.
      if lower_tag == "script" and op == "/" then
        node.childNodes[1].value = script_val
        script_open = false
        -- The script element never pushed a child list, so only the depth
        -- counter is unwound here.
        level = level - 1
      else
        -- Markup inside a script is data: put the tag back as text.
        script_val = script_val.."<"..op..tag..attr..op2..">"..bl1..val..bl2
      end

    elseif op == "/" then
      -- Close descendants whose end tag was omitted (e.g. a <td> still open
      -- when </tr> arrives).
      -- NOTE(review): the `not lazy` guard mirrors the sibling check in the
      -- start-tag branch; the original condition before `level > 0` is lost.
      while not lazy
        and omittable_tags2[lower_tag]
        and level > 0
        and omittable_tags2[lower_tag][open_node().tagName]
      do
        print("Auto closing "..
          open_node().tagName..
          " followed by ending "..lower_tag)
        level = level - 1
        table.remove(stack)
      end
      -- An end tag with nothing open ends the parse.
      if level == 0 then return tree end
      if lower_tag ~= open_node().tagName then
        print("Mismatch: "..lower_tag..
          ", (has "..open_node().tagName..")")
      end
      level = level - 1
      table.remove(stack)

    else
      level = level + 1
      node = { tagName = lower_tag, childNodes = {} }
      if attr ~= "" then
        node.attr = parse_attributes(attr)
      end

      -- Inline script: anything up to </script> is collected as raw text.
      -- A bare <script> has no attr table at all, so the test must not
      -- require one (the previous `node.attr and not node.attr["src"]`
      -- skipped that case).
      if lower_tag == "script" and not (node.attr and node.attr["src"]) then
        script_val = bl1..val..bl2
        table.insert(node.childNodes, text_node(""))
        table.insert(stack[level], node)
        script_open = true
      else
        -- If the innermost open element's end tag is implied by this start
        -- tag, close it so the new node becomes its sibling, not its child.
        -- `level > 1` guarantees level stays >= 1 after the decrement.
        if not lazy
          and omittable_tags[lower_tag]
          and level > 1
          and stack[level - 1]
          and #stack[level - 1] > 0
          and omittable_tags[lower_tag][stack[level - 1][#stack[level - 1]].tagName] == true
        then
          print("Auto closing "..
            stack[level - 1][#stack[level - 1]].tagName..
            " followed by "..lower_tag)
          level = level - 1
          table.remove(stack)
        end

        table.insert(stack[level], node)
        if empty_tags[lower_tag] then
          -- Void element: following text belongs to the parent.
          if val ~= "" then
            table.insert(stack[level], text_node(val))
          end
          node.childNodes = nil
          level = level - 1
        else
          if val ~= "" then
            table.insert(node.childNodes, text_node(val))
          end
          table.insert(stack, node.childNodes)
        end
      end
    end
  end

  -- Input ended inside an unterminated <script>: keep what was collected.
  if script_open then
    node.childNodes[1].value = script_val
  end
  return tree
end
return parser