From bba3e6024bc111cfe240194a9a7ed4e0d1680c1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gon=C3=A7alo?= <gon@ovalerio.net>
Date: Sat, 26 Mar 2022 18:24:12 +0000
Subject: [PATCH] calculate hashes of js and css code in element attributes

---
 inlinehashes/lib.py        | 162 +++++++++++++++++++++++++++++++++----
 tests/test_inlinehashes.py |  64 ++++++++++++---
 2 files changed, 200 insertions(+), 26 deletions(-)

diff --git a/inlinehashes/lib.py b/inlinehashes/lib.py
index a978b5b..24bf4b2 100644
--- a/inlinehashes/lib.py
+++ b/inlinehashes/lib.py
@@ -4,23 +4,20 @@ This small module helps you to parse HTML documents and extract all the inline
 content that must be specifically allowed in the Content-Security-Policy in
 order to work (assuming "unsafe-inline" is not present).
 """
-from typing import List
+from typing import List, Callable, Optional
 from dataclasses import dataclass
-from functools import cached_property
+from functools import cached_property, partial
 from itertools import chain
 import hashlib
 import base64
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 
-_VALID_TARGETS = {
-    "scripts": [
-        {"name": "script"},
-    ],
-    "styles": [
-        {"name": "style"},
-    ],
-}
+
+@dataclass(frozen=True)
+class SearchQuery:  # one lookup rule: how to find tags, and what to read
+    search_function: Callable  # predicate passed to soup.find_all()
+    attr_name: Optional[str]  # attribute to read; None -> first child node
 
 
 @dataclass(frozen=True)
@@ -62,6 +59,165 @@ class Inline:
         return f"Inline(content='{self.short_content}...')"
 
 
+def matches_attribute(tag: Tag, attribute_name: str) -> bool:
+    """Return True when ``tag`` carries the ``attribute_name`` attribute."""
+    return tag.has_attr(attribute_name)
+
+
+def matches_name(tag: Tag, name: str) -> bool:
+    """Return True when the name of ``tag`` equals ``name``."""
+    return tag.name == name
+
+
+# HTML attributes whose value is run as JavaScript. Each one found in a
+# document is reported as an inline script. Keep the list sorted.
+_EVENT_HANDLER_ATTRS = [
+    "onabort",
+    "onafterprint",
+    "onafterscriptexecute",
+    "onanimationcancel",
+    "onanimationend",
+    "onanimationiteration",
+    "onanimationstart",
+    "onauxclick",
+    "onbeforecopy",
+    "onbeforecut",
+    "onbeforeprint",
+    "onbeforescriptexecute",
+    "onbeforeunload",
+    "onbegin",
+    "onblur",
+    "onbounce",
+    "oncancel",
+    "oncanplay",
+    "oncanplaythrough",
+    "onchange",
+    "onclick",
+    "onclose",
+    "oncontextmenu",
+    "oncopy",
+    "oncuechange",
+    "oncut",
+    "ondblclick",
+    "ondrag",
+    "ondragend",
+    "ondragenter",
+    "ondragleave",
+    "ondragover",
+    "ondragstart",
+    "ondrop",
+    "ondurationchange",
+    "onemptied",
+    "onend",
+    "onended",
+    "onerror",
+    "onfocus",
+    "onfocusin",
+    "onfocusout",
+    "onformdata",
+    "onfullscreenchange",
+    "ongotpointercapture",
+    "onhashchange",
+    "oninput",
+    "oninvalid",
+    "onkeydown",
+    "onkeypress",
+    "onkeyup",
+    "onlanguagechange",
+    "onload",
+    "onloadeddata",
+    "onloadedmetadata",
+    "onloadend",
+    "onloadstart",
+    "onlostpointercapture",
+    "onmessage",
+    "onmessageerror",
+    "onmousedown",
+    "onmouseenter",
+    "onmouseleave",
+    "onmousemove",
+    "onmouseout",
+    "onmouseover",
+    "onmouseup",
+    "onmousewheel",
+    "onmozfullscreenchange",
+    "onoffline",
+    "ononline",
+    "onpagehide",
+    "onpageshow",
+    "onpaste",
+    "onpause",
+    "onplay",
+    "onplaying",
+    "onpointercancel",
+    "onpointerdown",
+    "onpointerenter",
+    "onpointerleave",
+    "onpointermove",
+    "onpointerout",
+    "onpointerover",
+    "onpointerrawupdate",
+    "onpointerup",
+    "onpopstate",
+    "onprogress",
+    "onratechange",
+    "onrejectionhandled",
+    "onrepeat",
+    "onreset",
+    "onresize",
+    "onscroll",
+    "onsearch",
+    "onsecuritypolicyviolation",
+    "onseeked",
+    "onseeking",
+    "onselect",
+    "onselectionchange",
+    "onselectstart",
+    "onshow",
+    "onslotchange",
+    "onstalled",
+    "onstart",
+    "onstorage",
+    "onsubmit",
+    "onsuspend",
+    "ontimeupdate",
+    "ontoggle",
+    "ontouchcancel",
+    "ontouchend",
+    "ontouchmove",
+    "ontouchstart",
+    "ontransitioncancel",
+    "ontransitionend",
+    "ontransitionrun",
+    "ontransitionstart",
+    "onunhandledrejection",
+    "onunload",
+    "onvolumechange",
+    "onwaiting",
+    "onwebkitanimationend",
+    "onwebkitanimationiteration",
+    "onwebkitanimationstart",
+    "onwebkittransitionend",
+    "onwheel",
+]
+
+# Search queries grouped by target kind: tag-name queries (attr_name None)
+# yield the tag contents, attribute queries yield the attribute value.
+_VALID_TARGETS = {
+    "scripts": [
+        SearchQuery(partial(matches_name, name="script"), None),
+        *[
+            SearchQuery(partial(matches_attribute, attribute_name=attr), attr)
+            for attr in _EVENT_HANDLER_ATTRS
+        ],
+    ],
+    "styles": [
+        SearchQuery(partial(matches_name, name="style"), None),
+        SearchQuery(partial(matches_attribute, attribute_name="style"), "style"),
+    ],
+}
+
+
 def parse(content: str, target: str = "all") -> List[Inline]:
     """Parses an HTML document and extracts."""
     soup = BeautifulSoup(content, "html.parser")
@@ -75,6 +203,12 @@ def parse(content: str, target: str = "all") -> List[Inline]:
 
     elements = []
     for q in search_queries:
-        elements += soup.find_all(**q)
-
-    return [Inline(e.contents[0]) for e in elements if e.contents]
+        for tag in soup.find_all(q.search_function):
+            if q.attr_name:
+                inline = Inline(tag[q.attr_name])
+            else:
+                if not tag.contents:
+                    continue
+                inline = Inline(tag.contents[0])
+            elements.append(inline)
+    return elements
diff --git a/tests/test_inlinehashes.py b/tests/test_inlinehashes.py
index dab2344..fc844a6 100644
--- a/tests/test_inlinehashes.py
+++ b/tests/test_inlinehashes.py
@@ -1,8 +1,8 @@
 import pytest
 
 
-from inlinehashes import __version__
-from inlinehashes.lib import Inline
+from inlinehashes import __version__, parse
+from inlinehashes.lib import Inline, _EVENT_HANDLER_ATTRS
 
 
 class TestInline:
@@ -105,22 +105,62 @@ class TestInline:
 
 
 class TestParse:
-    @pytest.mark.skip(reason="Add later")
     def test_parse_detects_script_tags(self):
-        pass
+        doc = """
+        <html>
+        <head><title>Some title</title></head>
+        <body>Some body
+        <script>alert("hash this");</script>
+        </body>
+        </html>
+        """
+        inlines = parse(doc)  # target defaults to "all"
+        assert len(inlines) == 1  # only the <script> element is reported
+        assert inlines[0].content == 'alert("hash this");'
 
-    @pytest.mark.skip(reason="Add later")
     def test_parse_detects_style_tags(self):
-        pass
+        doc = """
+        <html>
+        <head>
+          <title>Some title</title>
+          <style>.someclass { background:#142a3f; }</style>
+        </head>
+        <body>Some body</body>
+        </html>
+        """
+        inlines = parse(doc)  # target defaults to "all"
+        assert len(inlines) == 1  # only the <style> element is reported
+        assert inlines[0].content == ".someclass { background:#142a3f; }"
 
-    @pytest.mark.skip(reason="Not Implemented yet")
     def test_parse_detects_style_attributes(self):
-        pass
+        doc = """
+        <html>
+        <head>
+          <title>Some title</title>
+        </head>
+        <body style="text-color: #000;">Some body</body>
+        </html>
+        """
+        inlines = parse(doc)  # target defaults to "all"
+        assert len(inlines) == 1  # only the style="" attribute is reported
+        assert inlines[0].content == "text-color: #000;"  # raw attribute value
 
-    @pytest.mark.skip(reason="Not Implemented yet")
-    def test_parse_detect_attributes_with_js(self):
-        pass
+    @pytest.mark.parametrize("attr", _EVENT_HANDLER_ATTRS)
+    def test_parse_detect_attributes_with_js(self, attr):
+        # Only detection is checked here: every handler is put on <body>,
+        # even though several of them are not valid on that element.
+        doc = f"""
+        <html>
+        <head>
+          <title>Some title</title>
+        </head>
+        <body {attr}="alert(1);">Some body</body>
+        </html>
+        """
+        inlines = parse(doc)  # target defaults to "all"
+        assert len(inlines) == 1
+        assert inlines[0].content == "alert(1);"  # raw attribute value
 
 
 def test_version():
-    assert __version__ == "0.0.1"
+    assert __version__ == "0.0.2"  # NOTE(review): version bump not in this patch