Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parsing robustness #8436

Closed
wants to merge 11 commits into from
63 changes: 27 additions & 36 deletions publisher/htmlElementsCollector.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,11 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
w.buff.WriteByte(b)

if !w.inQuote && b == '>' {
w.endCollecting()
break
s := w.buff.String()
if w.inPreTag == "" || w.parseEndTag(s) {
w.endCollecting()
break
}
}
}
}
Expand All @@ -151,31 +154,18 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
continue
}

if w.inPreTag != "" { // within preformatted code block
s := w.buff.String()
// Ignore everything inside a 'preformatted' tag. The tag its self is
// recorded already.
if w.inPreTag != "" {
w.inPreTag = ""
w.buff.Reset()
if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
w.inPreTag = ""
}
continue
}

// First check if we have processed this element before.
w.collector.mu.RLock()

// Work with the bytes slice as long as it's practical,
// to save memory allocations.
b := w.buff.Bytes()

// See https://github.com/dominikh/go-tools/issues/723
//lint:ignore S1030 This construct avoids memory allocation for the string.
seen := w.collector.elementSet[string(b)]
w.collector.mu.RUnlock()
if seen {
w.buff.Reset()
continue
}

// Filter out unwanted tags
// if within preformatted code blocks <pre>, <textarea>, <script>, <style>
// comments and doctype tags
Expand Down Expand Up @@ -242,28 +232,26 @@ func parseStartTag(s string) (string, bool) {
s = strings.TrimPrefix(s, "<")
s = strings.TrimSuffix(s, ">")

spaceIndex := strings.Index(s, " ")
if spaceIndex != -1 {
s = s[:spaceIndex]
}
fields := strings.Fields(s);
tag := fields[0];

return strings.ToLower(strings.TrimSpace(s)), true
return strings.ToLower(tag), true
}

func parseEndTag(s string) (string, bool) {
if !strings.HasPrefix(s, "</") {
return "", false
func (c *htmlElementsCollectorWriter) parseEndTag(s string) bool {
slen := len(s)
if slen > 83 {
// Avoid processing big amounts with ToLower().
s = s[slen - 83:]
}

s = strings.TrimPrefix(s, "</")
s = strings.TrimSuffix(s, ">")

return strings.ToLower(strings.TrimSpace(s)), true
s = strings.ToLower(s)
return strings.HasSuffix(s, "</"+c.inPreTag+">")
}

// No need to look inside these for HTML elements.
// No need to look inside these for HTML elements. These can contain arbitrary
// code which could be misunderstood as HTML.
func isPreFormatted(s string) bool {
return s == "pre" || s == "textarea" || s == "script" || s == "style"
return s == "pre" || s == "textarea" || s == "script" || s == "style" || s == "title" || s == "canvas"
}

type htmlElement struct {
Expand Down Expand Up @@ -299,7 +287,10 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
// and pretend it's a <div>.
if exceptionList[tagName] {
tagBuffer = tagName
elStr = strings.Replace(elStr, tagName, "div", 1)

fields := strings.Fields(elStr);
fields[0] = "<div"
elStr = strings.Join(fields, " ")
}

n, err := html.Parse(strings.NewReader(elStr))
Expand All @@ -308,7 +299,7 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
}
var walk func(*html.Node)
walk = func(n *html.Node) {
if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) {
if n.Type == html.ElementNode && strings.Contains(strings.ToLower(elStr), n.Data) {
el.Tag = n.Data

for _, a := range n.Attr {
Expand Down
13 changes: 11 additions & 2 deletions publisher/htmlElementsCollector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ func TestClassCollector(t *testing.T) {
<thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead>
<tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody>
</table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")},
{"thead uppercase", `<TABLE class="CL1">
<THEAD class="CL2"><TR class="CL3"><TD class="CL4"></TD></TR></THEAD>
<TBODY class="CL5"><TR class="CL6"><TD class="CL7"></TD></TR></TBODY>
</TABLE>`, f("table tbody td thead tr", "CL1 CL2 CL3 CL4 CL5 CL6 CL7", "")},
// https://github.com/gohugoio/hugo/issues/7161
{"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")},
{"AlpineJS bind 1", `<body>
Expand All @@ -78,7 +82,7 @@ func TestClassCollector(t *testing.T) {
f("div", "bg-black bg-gray-300 inline-block mb-2 mr-1 px-2 py-2 rounded", ""),
},
{"AlpineJS bind 3", `<div x-bind:class="{ 'text-gray-800': !checked, 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
{"AlpineJS bind 4", `<div x-bind:class="{ 'text-gray-800': !checked,
{"AlpineJS bind 4", `<div x-bind:class="{ 'text-gray-800': !checked,
'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")},
{"AlpineJS bind 5", `<a x-bind:class="{
'text-a': a && b,
Expand All @@ -89,14 +93,19 @@ func TestClassCollector(t *testing.T) {
}" class="block w-36 cursor-pointer pr-3 no-underline capitalize"></a>`, f("a", "block capitalize cursor-pointer no-underline pl-2 pl-3 pr-3 text-a text-b text-gray-600 w-36", "")},
{"AlpineJS transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")},
{"Vue bind", `<div v-bind:class="{ active: isActive }"></div>`, f("div", "active", "")},
{"Uppercase tags", `<DIV></DIV>`, f("div", "", "")},
{"Predefined tags with distinct casing", `<script>if (a < b) { nothing(); }</SCRIPT><div></div>`, f("div script", "", "")},
// Issue #7746
{"Apostrophe inside attribute value", `<a class="missingclass" title="Plus d'information">my text</a><div></div>`, f("a div", "missingclass", "")},
// Issue #7567
{"Script tags content should be skipped", `<script><span>foo</span><span>bar</span></script><div class="foo"></div>`, f("div script", "foo", "")},
{"Style tags content should be skipped", `<style>p{color: red;font-size: 20px;}</style><div class="foo"></div>`, f("div style", "foo", "")},
{"Pre tags content should be skipped", `<pre class="preclass"><span>foo</span><span>bar</span></pre><div class="foo"></div>`, f("div pre", "foo preclass", "")},
{"A < in a preformatted tag content shouldn't mess up", `<script>if (a < b) { doNothing(); }</script><div class="foo"></div>`, f("div script", "foo", "")},
{"Preformatted content with <, twice", `<script>r = a < b;</script><div class="foo"></div><script>r = a < b;</script><div class="foo"></div>`, f("div script", "foo", "")},
{"JS text named like a tag content shouldn't mess up", `<script>r = a < b; //script>2</script><div class="foo"></div>`, f("div script", "foo", "")},
{"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
{"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")},
{"DOCTYPE should be skipped", `<!DOCTYPE html>`, f("", "", "")},
{"Comments should be skipped", `<!-- example comment -->`, f("", "", "")},
// Issue #8417
{"Tabs inline", `<hr id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")},
Expand Down