From bba3e6024bc111cfe240194a9a7ed4e0d1680c1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gon=C3=A7alo?= Date: Sat, 26 Mar 2022 18:24:12 +0000 Subject: [PATCH] calculate hashes of js and css code in element attributes --- inlinehashes/lib.py | 162 +++++++++++++++++++++++++++++++++---- tests/test_inlinehashes.py | 64 ++++++++++++--- 2 files changed, 200 insertions(+), 26 deletions(-) diff --git a/inlinehashes/lib.py b/inlinehashes/lib.py index a978b5b..24bf4b2 100644 --- a/inlinehashes/lib.py +++ b/inlinehashes/lib.py @@ -4,23 +4,20 @@ This small module helps you to parse HTML documents and extract all the inline content that must be specifically allowed in the Content-Security-Policy in order to work (assuming "unsafe-inline" is not present). """ -from typing import List +from typing import List, Callable, Optional from dataclasses import dataclass -from functools import cached_property +from functools import cached_property, partial from itertools import chain import hashlib import base64 -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag -_VALID_TARGETS = { - "scripts": [ - {"name": "script"}, - ], - "styles": [ - {"name": "style"}, - ], -} + +@dataclass(frozen=True) +class SearchQuery: + search_function: Callable + attr_name: Optional[str] @dataclass(frozen=True) @@ -62,6 +59,137 @@ class Inline: return f"Inline(content='{self.short_content}...')" +def matches_attribute(tag: Tag, attribute_name: str) -> bool: + return tag.has_attr(attribute_name) + + +def matches_name(tag: Tag, name: str) -> bool: + return tag.name == name + + +_EVENT_HANDLER_ATTRS = [ + "onafterprint", + "onafterscriptexecute", + "onanimationcancel", + "onanimationend", + "onanimationiteration", + "onanimationstart", + "onauxclick", + "onbeforecopy", + "onbeforecut", + "onbeforeprint", + "onbeforescriptexecute", + "onbeforeunload", + "onbegin", + "onblur", + "onbounce", + "oncanplay", + "oncanplaythrough", + "onchange", + "onclick", + "onclose", + "oncontextmenu", + "oncopy", + "oncuechange", + "oncut", + "ondblclick", + "ondrag", + "ondragend", + "ondragenter", + "ondragleave", + "ondragover", + "ondragstart", + "ondrop", + "ondurationchange", + "onend", + "onended", + "onerror", + "onfocusin", + "onfocusout", + "onfullscreenchange", + "onhashchange", + "oninput", + "oninvalid", + "onkeydown", + "onkeypress", + "onkeyup", + "onload", + "onloadeddata", + "onloadedmetadata", + "onloadend", + "onloadstart", + "onmessage", + "onmousedown", + "onmouseenter", + "onmouseleave", + "onmousemove", + "onmouseout", + "onmouseover", + "onmouseup", + "onmousewheel", + "onmozfullscreenchange", + "onpagehide", + "onpageshow", + "onpaste", + "onpause", + "onplay", + "onplaying", + "onpointerdown", + "onpointerenter", + "onpointerleave", + "onpointermove", + "onpointerout", + "onpointerover", + "onpointerrawupdate", + "onpointerup", + "onpopstate", + "onprogress", + "onrepeat", + "onreset", + "onresize", + "onscroll", + "onsearch", + "onseeked", + "onseeking", + "onselect", + "onselectionchange", + "onselectstart", + "onshow", + "onstart", + "onsubmit", + "ontoggle", + "ontouchend", + "ontouchmove", + "ontouchstart", + "ontransitioncancel", + "ontransitionend", + "ontransitionrun", + "ontransitionstart", + "onunhandledrejection", + "onunload", + "onvolumechange", + "onwebkitanimationend", + "onwebkitanimationiteration", + "onwebkitanimationstart", + "onwebkittransitionend", + "onwheel", +] + +_VALID_TARGETS = { + "scripts": [ + SearchQuery(partial(matches_name, name="script"), None), + *[ + SearchQuery(partial(matches_attribute, attribute_name=attr), attr) + for attr in _EVENT_HANDLER_ATTRS + ], + ], + "styles": [ + SearchQuery(partial(matches_name, name="style"), None), + SearchQuery(partial(matches_attribute, attribute_name="style"), "style"), + ], +} + + def parse(content: str, target: str = "all") -> List[Inline]: """Parses an HTML document and extracts.""" soup = BeautifulSoup(content, "html.parser") @@ -75,6 +203,12 @@ def parse(content: str, target: str = "all") -> List[Inline]: elements = [] for q in search_queries: - elements += soup.find_all(**q) - - return [Inline(e.contents[0]) for e in elements if e.contents] + for tag in soup.find_all(q.search_function): + if q.attr_name: + inline = Inline(tag[q.attr_name]) + else: + if not tag.contents: + continue + inline = Inline(tag.contents[0]) + elements.append(inline) + return elements diff --git a/tests/test_inlinehashes.py b/tests/test_inlinehashes.py index dab2344..fc844a6 100644 --- a/tests/test_inlinehashes.py +++ b/tests/test_inlinehashes.py @@ -1,8 +1,8 @@ import pytest -from inlinehashes import __version__ -from inlinehashes.lib import Inline +from inlinehashes import __version__, parse +from inlinehashes.lib import Inline, _EVENT_HANDLER_ATTRS class TestInline: @@ -105,22 +105,62 @@ class TestInline: class TestParse: - @pytest.mark.skip(reason="Add later") def test_parse_detects_script_tags(self): - pass + doc = """ + + Some title + Some body + + + + """ + inlines = parse(doc) + assert len(inlines) == 1 + assert inlines[0].content == 'alert("hash this");' - @pytest.mark.skip(reason="Add later") def test_parse_detects_style_tags(self): - pass + doc = """ + + + Some title + + + Some body + + """ + inlines = parse(doc) + assert len(inlines) == 1 + assert inlines[0].content == ".someclass { background:#142a3f; }" - @pytest.mark.skip(reason="Not Implemented yet") def test_parse_detects_style_attributes(self): - pass + doc = """ + + + Some title + + Some body + + """ + inlines = parse(doc) + assert len(inlines) == 1 + assert inlines[0].content == "text-color: #000;" - @pytest.mark.skip(reason="Not Implemented yet") - def test_parse_detect_attributes_with_js(self): - pass + @pytest.mark.parametrize("attr", _EVENT_HANDLER_ATTRS) + def test_parse_detect_attributes_with_js(self, attr): + # Just to test they are detected even though some of them are + # not valid for all elements + doc = f""" + + + Some title + + Some body + + """ + inlines = parse(doc) + assert len(inlines) == 1 + assert inlines[0].content == "alert(1);" def test_version(): - assert __version__ == "0.0.1" + assert __version__ == "0.0.2"