"""Inline Hashes - Helping with CSP when possible. This small module helps you to parse HTML documents and extract all the inline content that must be specifically allowed in the Content-Security-Policy in order to work (assuming "unsafe-inline" is not present). """ from typing import List from dataclasses import dataclass from functools import cached_property from itertools import chain import hashlib import base64 from bs4 import BeautifulSoup _VALID_TARGETS = { "scripts": [ {"name": "script"}, ], "styles": [ {"name": "style"}, ], } @dataclass(frozen=True) class Inline: """Represents a piece of content present in the HTML document. It can be the value of an element/node or the value of an attribute of a given element/node. """ content: str @cached_property def short_content(self) -> str: return self.content[:50] @cached_property def sha256(self) -> str: h = hashlib.sha256(self.content.encode("utf-8")) h_b64 = base64.b64encode(h.digest()).decode("utf8") return f"sha256-{h_b64}" @cached_property def sha384(self) -> str: h = hashlib.sha384(self.content.encode("utf-8")) h_b64 = base64.b64encode(h.digest()).decode("utf8") return f"sha384-{h_b64}" @cached_property def sha512(self) -> str: h = hashlib.sha512(self.content.encode("utf-8")) h_b64 = base64.b64encode(h.digest()).decode("utf8") return f"sha512-{h_b64}" def __repr__(self) -> str: return f"Inline(content='{self.content}')" def __str__(self) -> str: return f"Inline(content='{self.short_content}...')" def parse(content: str, target: str = "all") -> List[Inline]: """Parses an HTML document and extracts.""" soup = BeautifulSoup(content, "html.parser") if target == "all": search_queries = chain(*_VALID_TARGETS.values()) elif target in _VALID_TARGETS.keys(): search_queries = _VALID_TARGETS[target] else: raise ValueError("Invalid Target") elements = [] for q in search_queries: elements += soup.find_all(**q) return [Inline(e.contents[0]) for e in elements if e.contents]