calculate hashes of js and css code in element attributes

This commit is contained in:
Gonçalo Valério 2022-03-26 18:24:12 +00:00
parent fa40e7323d
commit bba3e6024b
Signed by: dethos
GPG Key ID: DF557F2BDCC2445E
2 changed files with 200 additions and 26 deletions

View File

@ -4,23 +4,20 @@ This small module helps you to parse HTML documents and extract all the inline
content that must be specifically allowed in the Content-Security-Policy in
order to work (assuming "unsafe-inline" is not present).
"""
from typing import List
from typing import List, Callable, Optional
from dataclasses import dataclass
from functools import cached_property
from functools import cached_property, partial
from itertools import chain
import hashlib
import base64
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
_VALID_TARGETS = {
"scripts": [
{"name": "script"},
],
"styles": [
{"name": "style"},
],
}
@dataclass(frozen=True)
class SearchQuery:
    """Describes one way of locating inline content in a parsed document."""

    # Predicate handed to ``find_all``; it receives a tag and decides whether
    # that tag matches.
    search_function: Callable
    # Attribute whose value is the inline content. ``None`` means the content
    # is taken from the tag's first child instead.
    attr_name: Optional[str]
@dataclass(frozen=True)
@ -62,6 +59,137 @@ class Inline:
return f"Inline(content='{self.short_content}...')"
def matches_attribute(tag: Tag, attribute_name: str) -> bool:
    """Report whether ``tag`` carries an attribute called ``attribute_name``."""
    is_present = tag.has_attr(attribute_name)
    return is_present
def matches_name(tag: Tag, name: str) -> bool:
    """Report whether the element name of ``tag`` is exactly ``name``."""
    tag_name = tag.name
    return tag_name == name
_EVENT_HANDLER_ATTRS = [
"onafterprint",
"onafterscriptexecute",
"onanimationcancel",
"onanimationend",
"onanimationiteration",
"onanimationstart",
"onauxclick",
"onbeforecopy",
"onbeforecut",
"onbeforeprint",
"onbeforescriptexecute",
"onbeforeunload",
"onbegin",
"onblur",
"onbounce",
"oncanplay",
"oncanplaythrough",
"onchange",
"onclick",
"onclose",
"oncontextmenu",
"oncopy",
"oncuechange",
"oncut",
"ondblclick",
"ondrag",
"ondragend",
"ondragenter",
"ondragleave",
"ondragover",
"ondragstart",
"ondrop",
"ondurationchange",
"onend",
"onended",
"onerror",
"onfocusin",
"onfocusout",
"onfullscreenchange",
"onhashchange",
"oninput",
"oninvalid",
"onkeydown",
"onkeypress",
"onkeyup",
"onload",
"onloadeddata",
"onloadedmetadata",
"onloadend",
"onloadstart",
"onmessage",
"onmousedown",
"onmouseenter",
"onmouseleave",
"onmousemove",
"onmouseout",
"onmouseover",
"onmouseup",
"onmousewheel",
"onmozfullscreenchange",
"onpagehide",
"onpageshow",
"onpaste",
"onpause",
"onplay",
"onplaying",
"onpointerdown",
"onpointerenter",
"onpointerleave",
"onpointermove",
"onpointerout",
"onpointerover",
"onpointerrawupdate",
"onpointerup",
"onpopstate",
"onprogress",
"onrepeat",
"onreset",
"onresize",
"onscroll",
"onsearch",
"onseeked",
"onseeking",
"onselect",
"onselectionchange",
"onselectstart",
"onshow",
"onstart",
"onsubmit",
"ontoggle",
"ontouchend",
"ontouchmove",
"ontouchstart",
"ontransitioncancel",
"ontransitionend",
"ontransitionrun",
"ontransitionstart",
"onunhandledrejection",
"onunload",
"onvolumechange",
"onwebkitanimationend",
"onwebkitanimationiteration",
"onwebkitanimationstart",
"onwebkittransitionend",
"onwheel",
]
# Search queries used to locate inline content, grouped by target kind.
_VALID_TARGETS = {
    # <script> elements first, then one query per event-handler attribute.
    "scripts": [SearchQuery(partial(matches_name, name="script"), None)]
    + [
        SearchQuery(partial(matches_attribute, attribute_name=handler), handler)
        for handler in _EVENT_HANDLER_ATTRS
    ],
    # <style> elements and ``style="..."`` attributes.
    "styles": [
        SearchQuery(partial(matches_name, name="style"), None),
        SearchQuery(partial(matches_attribute, attribute_name="style"), "style"),
    ],
}
def parse(content: str, target: str = "all") -> List[Inline]:
"""Parses an HTML document and extracts its inline scripts and styles."""
soup = BeautifulSoup(content, "html.parser")
@ -75,6 +203,12 @@ def parse(content: str, target: str = "all") -> List[Inline]:
elements = []
for q in search_queries:
elements += soup.find_all(**q)
return [Inline(e.contents[0]) for e in elements if e.contents]
for tag in soup.find_all(q.search_function):
if q.attr_name:
inline = Inline(tag[q.attr_name])
else:
if not tag.contents:
continue
inline = Inline(tag.contents[0])
elements.append(inline)
return elements

View File

@ -1,8 +1,8 @@
import pytest
from inlinehashes import __version__
from inlinehashes.lib import Inline
from inlinehashes import __version__, parse
from inlinehashes.lib import Inline, _EVENT_HANDLER_ATTRS
class TestInline:
@ -105,22 +105,62 @@ class TestInline:
class TestParse:
@pytest.mark.skip(reason="Add later")
def test_parse_detects_script_tags(self):
pass
doc = """
<html>
<head><title>Some title</title></head>
<body>Some body
<script>alert("hash this");</script>
</body>
</html>
"""
inlines = parse(doc)
assert len(inlines) == 1
assert inlines[0].content == 'alert("hash this");'
@pytest.mark.skip(reason="Add later")
def test_parse_detects_style_tags(self):
pass
doc = """
<html>
<head>
<title>Some title</title>
<style>.someclass { background:#142a3f; }</style>
</head>
<body>Some body</body>
</html>
"""
inlines = parse(doc)
assert len(inlines) == 1
assert inlines[0].content == ".someclass { background:#142a3f; }"
@pytest.mark.skip(reason="Not Implemented yet")
def test_parse_detects_style_attributes(self):
pass
doc = """
<html>
<head>
<title>Some title</title>
</head>
<body style="text-color: #000;">Some body</body>
</html>
"""
inlines = parse(doc)
assert len(inlines) == 1
assert inlines[0].content == "text-color: #000;"
@pytest.mark.skip(reason="Not Implemented yet")
def test_parse_detect_attributes_with_js(self):
pass
@pytest.mark.parametrize("attr", _EVENT_HANDLER_ATTRS)
def test_parse_detect_attributes_with_js(self, attr):
# Just to test they are detected even though some of them are
# not valid for all elements
doc = f"""
<html>
<head>
<title>Some title</title>
</head>
<body {attr}="alert(1);">Some body</body>
</html>
"""
inlines = parse(doc)
assert len(inlines) == 1
assert inlines[0].content == "alert(1);"
def test_version():
assert __version__ == "0.0.1"
assert __version__ == "0.0.2"