diff --git a/Cargo.lock b/Cargo.lock index 0e8fb33..1131478 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -747,6 +747,16 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695" +[[package]] +name = "sanitize_html" +version = "0.7.0" +dependencies = [ + "html5ever", + "kuchiki", + "lazy_static", + "regex", +] + [[package]] name = "scopeguard" version = "1.1.0" diff --git a/Cargo.toml b/Cargo.toml index 142936f..9fae8e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,4 +2,5 @@ members = [ "bin", -] \ No newline at end of file + "sanitize-html-rs", +] diff --git a/sanitize-html-rs/.github/workflows/build.yml b/sanitize-html-rs/.github/workflows/build.yml new file mode 100644 index 0000000..893a284 --- /dev/null +++ b/sanitize-html-rs/.github/workflows/build.yml @@ -0,0 +1,31 @@ +name: Build + +on: [push, pull_request] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - macOS-latest + - windows-latest + rust: + - stable + + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + toolchain: ${{ matrix.rust }} + override: true + - name: Build + run: | + cargo build --all-targets --no-default-features --verbose + cargo build --all-targets --verbose + - name: Run tests + run: cargo test --all-targets --verbose + env: + RUST_BACKTRACE: 1 diff --git a/sanitize-html-rs/.github/workflows/coverage.yml b/sanitize-html-rs/.github/workflows/coverage.yml new file mode 100644 index 0000000..50e5624 --- /dev/null +++ b/sanitize-html-rs/.github/workflows/coverage.yml @@ -0,0 +1,27 @@ +name: Coverage + +on: + pull_request: + push: + branches: + - master + +jobs: + coverage: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + - uses: actions-rs/install@v0.1 + with: + crate: cargo-tarpaulin + 
use-tool-cache: true + - name: Run coverage + run: cargo tarpaulin -f -t 5 --out Xml -v -- --test-threads=1 + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 + with: + token: ${{secrets.CODECOV_TOKEN}} diff --git a/sanitize-html-rs/.github/workflows/style.yml b/sanitize-html-rs/.github/workflows/style.yml new file mode 100644 index 0000000..a46893f --- /dev/null +++ b/sanitize-html-rs/.github/workflows/style.yml @@ -0,0 +1,24 @@ +name: Style check + +on: [push, pull_request] + +jobs: + clippy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install clippy + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + components: clippy + - uses: actions-rs/clippy-check@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + args: --all --all-features + fmt: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - name: Run fmt check + run: cargo fmt --all -- --check diff --git a/sanitize-html-rs/.gitignore b/sanitize-html-rs/.gitignore new file mode 100644 index 0000000..c97f83a --- /dev/null +++ b/sanitize-html-rs/.gitignore @@ -0,0 +1,4 @@ +/target/ +**/*.rs.bk +Cargo.lock +/.vscode diff --git a/sanitize-html-rs/Cargo.toml b/sanitize-html-rs/Cargo.toml new file mode 100644 index 0000000..ee25582 --- /dev/null +++ b/sanitize-html-rs/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "sanitize_html" +version = "0.7.0" +authors = ["Andrey Kutejko "] +description = "Rule-based HTML Sanitization library" +keywords = ["html", "sanitize"] +license = "MIT" +homepage = "https://github.com/andy128k/sanitize-html-rs" +repository = "https://github.com/andy128k/sanitize-html-rs.git" +edition = "2018" + +[dependencies] +regex = "1" +lazy_static = "1" +html5ever = "0.25" +kuchiki = "0.8" diff --git a/sanitize-html-rs/LICENSE.txt b/sanitize-html-rs/LICENSE.txt new file mode 100644 index 0000000..d193d12 --- /dev/null +++ b/sanitize-html-rs/LICENSE.txt @@ -0,0 +1,18 @@ +Copyright (c) 2017 Andrey Kutejko + +Permission is hereby 
granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/sanitize-html-rs/README.md b/sanitize-html-rs/README.md new file mode 100644 index 0000000..3dcfedd --- /dev/null +++ b/sanitize-html-rs/README.md @@ -0,0 +1,8 @@ +# Sanitize HTML + +[![Crates.io Status](https://img.shields.io/crates/v/sanitize_html.svg)](https://crates.io/crates/sanitize_html) +[![Build](https://github.com/andy128k/sanitize-html-rs/workflows/Build/badge.svg?branch=master&event=push)](https://github.com/andy128k/sanitize-html-rs/actions?query=workflow%3ABuild) +[![codecov](https://codecov.io/gh/andy128k/sanitize-html-rs/branch/master/graph/badge.svg)](https://codecov.io/gh/andy128k/sanitize-html-rs) +[![dependency status](https://deps.rs/repo/github/andy128k/sanitize-html-rs/status.svg)](https://deps.rs/repo/github/andy128k/sanitize-html-rs) + +This is a library for sanitization of HTML fragments. 
diff --git a/sanitize-html-rs/src/errors.rs b/sanitize-html-rs/src/errors.rs new file mode 100644 index 0000000..2477a07 --- /dev/null +++ b/sanitize-html-rs/src/errors.rs @@ -0,0 +1,37 @@ +//! Error types, which can be emitted by the sanitization procedure. + +use std::error::Error; +use std::fmt; + +/// Sanitization error +#[derive(Debug)] +pub enum SanitizeError { + /// UTF-8 decoding error (wraps `std::str::Utf8Error`) + StrUtf8Error(std::str::Utf8Error), + + /// UTF-8 decoding error (wraps `std::string::FromUtf8Error`) + Utf8Error(std::string::FromUtf8Error), + + /// Serialization error + SerializeError(std::io::Error), +} + +impl fmt::Display for SanitizeError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + SanitizeError::StrUtf8Error(e) => write!(f, "UTF-8 decode error {}", e), + SanitizeError::Utf8Error(e) => write!(f, "UTF-8 decode error {}", e), + SanitizeError::SerializeError(e) => write!(f, "Serialization error {}", e), + } + } +} + +impl Error for SanitizeError { + fn source(&self) -> Option<&(dyn Error + 'static)> { + match self { + SanitizeError::StrUtf8Error(e) => Some(e), + SanitizeError::Utf8Error(e) => Some(e), + SanitizeError::SerializeError(e) => Some(e), + } + } +} diff --git a/sanitize-html-rs/src/lib.rs b/sanitize-html-rs/src/lib.rs new file mode 100644 index 0000000..e4797e7 --- /dev/null +++ b/sanitize-html-rs/src/lib.rs @@ -0,0 +1,42 @@ +//! HTML Sanitization library +//! +//! # Examples +//! +//! ``` +//! use sanitize_html::sanitize_str; +//! use sanitize_html::rules::predefined::DEFAULT; +//! +//! let input = "Lorem ipsum dolor sit
amet "; +//! +//! let sanitized_default: String = sanitize_str(&DEFAULT, input).unwrap(); +//! assert_eq!(&sanitized_default, "Lorem ipsum dolor sit amet "); +//! ``` + +#![deny(missing_docs)] + +pub mod errors; +mod parse; +pub mod rules; +mod sanitize; +mod tests; + +use crate::errors::SanitizeError; +use crate::rules::Rules; + +/// Sanitize HTML bytes +pub fn sanitize_bytes(rules: &Rules, input: &[u8]) -> Result, SanitizeError> { + let input_str = std::str::from_utf8(input).map_err(SanitizeError::StrUtf8Error)?; + let dom = parse::parse_str(input_str); + let new_dom = sanitize::sanitize_dom(&dom, rules); + let result_bytes = parse::unparse_bytes(&new_dom)?; + Ok(result_bytes) +} + +/// Sanitize HTML string +pub fn sanitize_str(rules: &Rules, input: &str) -> Result { + let dom = parse::parse_str(input); + let new_dom = sanitize::sanitize_dom(&dom, rules); + let result_bytes = parse::unparse_bytes(&new_dom)?; + let result_string = String::from_utf8(result_bytes).map_err(SanitizeError::Utf8Error)?; + Ok(result_string) +} diff --git a/sanitize-html-rs/src/parse.rs b/sanitize-html-rs/src/parse.rs new file mode 100644 index 0000000..41caba4 --- /dev/null +++ b/sanitize-html-rs/src/parse.rs @@ -0,0 +1,38 @@ +use super::errors::SanitizeError; +use html5ever::{ + interface::QualName, + local_name, namespace_prefix, namespace_url, ns, serialize, + serialize::{SerializeOpts, TraversalScope}, + tendril::TendrilSink, +}; +use kuchiki::{parse_html_with_options, NodeRef, ParseOpts}; +use std::default::Default; + +pub(crate) fn parse_str(input: &str) -> NodeRef { + let mut opts = ParseOpts::default(); + opts.tree_builder.drop_doctype = true; + + let mut parser = parse_html_with_options(opts); + parser.process(input.into()); + parser.finish() +} + +pub(crate) fn unparse_bytes(dom: &NodeRef) -> Result, SanitizeError> { + let mut buf: Vec = Vec::new(); + + let parent = QualName::new( + Some(namespace_prefix!("html")), + ns!(html), + local_name!("div"), + ); + + let opts = 
SerializeOpts { + scripting_enabled: false, + traversal_scope: TraversalScope::ChildrenOnly(Some(parent)), + create_missing_parent: false, + }; + + serialize(&mut buf, dom, opts).map_err(SanitizeError::SerializeError)?; + + Ok(buf) +} diff --git a/sanitize-html-rs/src/rules/mod.rs b/sanitize-html-rs/src/rules/mod.rs new file mode 100644 index 0000000..775382b --- /dev/null +++ b/sanitize-html-rs/src/rules/mod.rs @@ -0,0 +1,104 @@ +//! Structures to define sanitization rules. + +pub mod pattern; +pub mod predefined; + +use self::pattern::Pattern; +use std::collections::HashMap; +use std::collections::HashSet; + +/// structure to describe HTML element +pub struct Element { + /// name of an element + pub name: String, + /// Whitelist of allowed attributes + pub attributes: HashMap, + /// List of mandatory attributes and their values. + /// These attributes will be forcibly added to element. + pub mandatory_attributes: HashMap, +} + +impl Element { + /// Creates element descriptor + pub fn new(name: &str) -> Self { + Self { + name: name.to_owned(), + attributes: HashMap::new(), + mandatory_attributes: HashMap::new(), + } + } + + /// Adds an attribute + pub fn attribute(mut self, attribute: &str, pattern: Pattern) -> Self { + self.attributes.insert(attribute.to_owned(), pattern); + self + } + + /// Adds mandatory attribute + pub fn mandatory_attribute(mut self, attribute: &str, value: &str) -> Self { + self.mandatory_attributes + .insert(attribute.to_owned(), value.to_owned()); + self + } + + /// Checks if attribute is valid + pub fn is_valid(&self, attribute: &str, value: &str) -> bool { + match self.attributes.get(attribute) { + None => false, + Some(pattern) => pattern.matches(value), + } + } +} + +/// structure to describe sanitization rules +#[derive(Default)] +pub struct Rules { + /// Determines if comments are kept or stripped out of a document. + pub allow_comments: bool, + /// Allowed elements. 
+ pub allowed_elements: HashMap, + /// Elements which will be removed together with their children. + pub delete_elements: HashSet, + /// Elements which will be replaced by spaces (Their children will be processed recursively). + pub space_elements: HashSet, + /// Elements which will be renamed. + pub rename_elements: HashMap, +} + +impl Rules { + /// Creates a new rules set. + pub fn new() -> Self { + Self::default() + } + + /// Sets if comments are allowed + pub fn allow_comments(mut self, allow_comments: bool) -> Self { + self.allow_comments = allow_comments; + self + } + + /// Adds a rule for an allowed element + pub fn element(mut self, element: Element) -> Self { + self.allowed_elements.insert(element.name.clone(), element); + self + } + + /// Adds a rule to delete an element + pub fn delete(mut self, element_name: &str) -> Self { + self.delete_elements.insert(element_name.to_owned()); + self + } + + /// Adds a rule to replace an element with space + pub fn space(mut self, element_name: &str) -> Self { + self.space_elements.insert(element_name.to_owned()); + self + } + + /// Adds a rule to rename an element + pub fn rename(mut self, element_name: &str, to: &str) -> Self { + self.rename_elements + .insert(element_name.to_owned(), to.to_owned()); + self + } +} diff --git a/sanitize-html-rs/src/rules/pattern.rs b/sanitize-html-rs/src/rules/pattern.rs new file mode 100644 index 0000000..837a061 --- /dev/null +++ b/sanitize-html-rs/src/rules/pattern.rs @@ -0,0 +1,127 @@ +//! This module contains code dedicated to check validity of attribute's value. +//! +//! # Examples +//! ``` +//! use sanitize_html::rules::pattern::Pattern; +//! use regex::Regex; +//! +//! let href = Pattern::regex(Regex::new("^(ftp:|http:|https:|mailto:)").unwrap()) | +//! !Pattern::regex(Regex::new("^[^/]+[[:space:]]*:").unwrap()); +//! +//! assert!(href.matches("filename.xls")); +//! assert!(href.matches("http://foo.com/")); +//! assert!(href.matches(" filename with spaces .zip ")); +//! 
assert!(!href.matches(" javascript : window.location = '//example.com/'")); // Attempt to make XSS +//! ``` + +use regex::Regex; + +/// Value pattern +pub struct Pattern(pub Box bool + Sync + Send>); + +impl Pattern { + /// Creates pattern which accepts any value. + /// + /// # Example + /// ``` + /// use sanitize_html::rules::pattern::Pattern; + /// use regex::Regex; + /// + /// let pattern = Pattern::any(); + /// assert!(pattern.matches("")); + /// assert!(pattern.matches("pants")); + /// ``` + pub fn any() -> Self { + Pattern(Box::new(move |_value| true)) + } + + /// Creates a pattern which uses an already compiled regular expression to check a value. + /// + /// # Example + /// ``` + /// use sanitize_html::rules::pattern::Pattern; + /// use regex::Regex; + /// + /// let pattern = Pattern::regex(Regex::new("ant").unwrap()); + /// assert!(!pattern.matches("")); + /// assert!(pattern.matches("pants")); + /// ``` + pub fn regex(re: Regex) -> Self { + Pattern(Box::new(move |value| re.is_match(value))) + } + + /// Checks if a value matches the pattern. + pub fn matches(&self, value: &str) -> bool { + (self.0)(value) + } +} + +impl ::std::ops::Not for Pattern { + type Output = Pattern; + + /// Negates pattern + /// + /// # Example + /// ``` + /// use sanitize_html::rules::pattern::Pattern; + /// use regex::Regex; + /// + /// let pattern = !Pattern::any(); + /// assert!(!pattern.matches("")); + /// assert!(!pattern.matches("pants")); + /// ``` + fn not(self) -> Self::Output { + let cb = self.0; + Pattern(Box::new(move |value| !cb(value))) + } +} + +impl ::std::ops::BitAnd for Pattern { + type Output = Pattern; + + /// Combines two patterns into a pattern which matches a string iff both patterns match that string. 
+ /// + /// # Example + /// ``` + /// use sanitize_html::rules::pattern::Pattern; + /// use regex::Regex; + /// + /// let pan = Pattern::regex(Regex::new("pan").unwrap()); + /// let ant = Pattern::regex(Regex::new("ant").unwrap()); + /// let pattern = pan & ant; + /// + /// assert!(!pattern.matches("pan")); + /// assert!(!pattern.matches("ant")); + /// assert!(pattern.matches("pants")); + /// ``` + fn bitand(self, rhs: Pattern) -> Self::Output { + let cb1 = self.0; + let cb2 = rhs.0; + Pattern(Box::new(move |value| cb1(value) && cb2(value))) + } +} + +impl ::std::ops::BitOr for Pattern { + type Output = Pattern; + + /// Combines two patterns into a pattern which matches a string if one of patterns matches that string. + /// + /// # Example + /// ``` + /// use sanitize_html::rules::pattern::Pattern; + /// use regex::Regex; + /// + /// let pan = Pattern::regex(Regex::new("pan").unwrap()); + /// let pot = Pattern::regex(Regex::new("pot").unwrap()); + /// let pattern = pan | pot; + /// + /// assert!(pattern.matches("pants")); + /// assert!(pattern.matches("pot")); + /// assert!(!pattern.matches("jar")); + /// ``` + fn bitor(self, rhs: Pattern) -> Self::Output { + let cb1 = self.0; + let cb2 = rhs.0; + Pattern(Box::new(move |value| cb1(value) || cb2(value))) + } +} diff --git a/sanitize-html-rs/src/rules/predefined.rs b/sanitize-html-rs/src/rules/predefined.rs new file mode 100644 index 0000000..6710ae3 --- /dev/null +++ b/sanitize-html-rs/src/rules/predefined.rs @@ -0,0 +1,374 @@ +//! Predefined rules +//! +//! These rules are inspired by a great Ruby gem [sanitize](https://github.com/rgrove/sanitize/). 
+ +use super::pattern::Pattern; +use super::{Element, Rules}; +use lazy_static::lazy_static; +use regex::Regex; + +fn re(regex: &str) -> Pattern { + Pattern::regex(Regex::new(regex).unwrap()) +} + +fn href() -> Pattern { + re("^(ftp:|http:|https:|mailto:)") | !re("^[^/]+[[:space:]]*:") +} + +fn src() -> Pattern { + re("^(http:|https:)") | !re("^[^/]+[[:space:]]*:") +} + +lazy_static! { + /// Basic rules. Allows a variety of markup including formatting elements, links, and lists. + pub static ref BASIC: Rules = basic(); + + /// Default rules. Removes all tags. + pub static ref DEFAULT: Rules = default(); + + /// Relaxed rules. Allows an even wider variety of markup, including images and tables + pub static ref RELAXED: Rules = relaxed(); + + /// Restricted rules. Allows only very simple inline markup. No links, images, or block elements. + pub static ref RESTRICTED: Rules = restricted(); + + /// Rules for document from untrusted sources. Removes all tags but text emphasizing and links. + pub static ref UNTRUSTED: Rules = untrusted(); +} + +fn basic() -> Rules { + Rules::new() + .element(Element::new("a").attribute("href", href())) + .element(Element::new("abbr").attribute("title", Pattern::any())) + .element(Element::new("b")) + .element(Element::new("blockquote").attribute("cite", src())) + .element(Element::new("br")) + .element(Element::new("br")) + .element(Element::new("cite")) + .element(Element::new("code")) + .element(Element::new("dd")) + .element(Element::new("dfn").attribute("title", Pattern::any())) + .element(Element::new("dl")) + .element(Element::new("dt")) + .element(Element::new("em")) + .element(Element::new("i")) + .element(Element::new("kbd")) + .element(Element::new("li")) + .element(Element::new("mark")) + .element(Element::new("ol")) + .element(Element::new("p")) + .element(Element::new("pre")) + .element(Element::new("q").attribute("cite", src())) + .element(Element::new("s")) + .element(Element::new("samp")) + .element(Element::new("small")) 
+ .element(Element::new("strike")) + .element(Element::new("strong")) + .element(Element::new("sub")) + .element(Element::new("sup")) + .element( + Element::new("time") + .attribute("datetime", Pattern::any()) + .attribute("pubdate", Pattern::any()), + ) + .element(Element::new("u")) + .element(Element::new("ul")) + .element(Element::new("var")) + .space("address") + .space("article") + .space("aside") + .space("div") + .space("footer") + .space("h1") + .space("h2") + .space("h3") + .space("h4") + .space("h5") + .space("h6") + .space("header") + .space("hgroup") + .space("hr") + .space("nav") + .space("section") +} + +fn default() -> Rules { + Rules::new() + .space("address") + .space("article") + .space("aside") + .space("blockquote") + .space("br") + .space("dd") + .space("div") + .space("dl") + .space("dt") + .space("footer") + .space("h1") + .space("h2") + .space("h3") + .space("h4") + .space("h5") + .space("h6") + .space("header") + .space("hgroup") + .space("hr") + .space("li") + .space("nav") + .space("ol") + .space("p") + .space("pre") + .space("section") + .space("ul") + .delete("iframe") + .delete("noembed") + .delete("noframes") + .delete("noscript") + .delete("script") + .delete("style") +} + +fn relaxed() -> Rules { + fn relaxed_element(name: &str) -> Element { + Element::new(name) + .attribute("dir", Pattern::any()) + .attribute("lang", Pattern::any()) + .attribute("title", Pattern::any()) + .attribute("class", Pattern::any()) + } + + Rules::new() + .element(relaxed_element("a").attribute("href", href())) + .element(relaxed_element("abbr")) + .element(relaxed_element("b")) + .element(relaxed_element("bdo")) + .element(relaxed_element("blockquote").attribute("cite", src())) + .element(relaxed_element("br")) + .element(relaxed_element("caption")) + .element(relaxed_element("cite")) + .element(relaxed_element("code")) + .element( + relaxed_element("col") + .attribute("span", Pattern::any()) + .attribute("width", Pattern::any()), + ) + .element( + 
relaxed_element("colgroup") + .attribute("span", Pattern::any()) + .attribute("width", Pattern::any()), + ) + .element(relaxed_element("dd")) + .element( + relaxed_element("del") + .attribute("cite", src()) + .attribute("datetime", Pattern::any()), + ) + .element(relaxed_element("dfn")) + .element(relaxed_element("dl")) + .element(relaxed_element("dt")) + .element(relaxed_element("em")) + .element(relaxed_element("figcaption")) + .element(relaxed_element("figure")) + .element(relaxed_element("h1")) + .element(relaxed_element("h2")) + .element(relaxed_element("h3")) + .element(relaxed_element("h4")) + .element(relaxed_element("h5")) + .element(relaxed_element("h6")) + .element(relaxed_element("hgroup")) + .element(relaxed_element("i")) + .element( + relaxed_element("img") + .attribute("src", src()) + .attribute("align", Pattern::any()) + .attribute("alt", Pattern::any()) + .attribute("width", Pattern::any()) + .attribute("height", Pattern::any()), + ) + .element( + relaxed_element("ins") + .attribute("cite", src()) + .attribute("datetime", Pattern::any()), + ) + .element(relaxed_element("kbd")) + .element(relaxed_element("li")) + .element(relaxed_element("mark")) + .element( + relaxed_element("ol") + .attribute("start", Pattern::any()) + .attribute("reversed", Pattern::any()) + .attribute("type", Pattern::any()), + ) + .element(relaxed_element("p")) + .element(relaxed_element("pre")) + .element(relaxed_element("q").attribute("cite", src())) + .element(relaxed_element("rp")) + .element(relaxed_element("rt")) + .element(relaxed_element("ruby")) + .element(relaxed_element("s")) + .element(relaxed_element("samp")) + .element(relaxed_element("small")) + .element(relaxed_element("strike")) + .element(relaxed_element("strong")) + .element(relaxed_element("sub")) + .element(relaxed_element("sup")) + .element( + relaxed_element("table") + .attribute("summary", Pattern::any()) + .attribute("width", Pattern::any()), + ) + .element(relaxed_element("tbody")) + .element( + 
relaxed_element("td") + .attribute("abbr", Pattern::any()) + .attribute("axis", Pattern::any()) + .attribute("colspan", Pattern::any()) + .attribute("rowspan", Pattern::any()) + .attribute("width", Pattern::any()), + ) + .element(relaxed_element("tfoot")) + .element( + relaxed_element("th") + .attribute("abbr", Pattern::any()) + .attribute("axis", Pattern::any()) + .attribute("colspan", Pattern::any()) + .attribute("rowspan", Pattern::any()) + .attribute("scope", Pattern::any()) + .attribute("width", Pattern::any()), + ) + .element(relaxed_element("thead")) + .element( + relaxed_element("time") + .attribute("datetime", Pattern::any()) + .attribute("pubdate", Pattern::any()), + ) + .element(relaxed_element("tr")) + .element(relaxed_element("u")) + .element(relaxed_element("ul").attribute("type", Pattern::any())) + .element(relaxed_element("var")) + .element(relaxed_element("wbr")) + .space("address") + .space("article") + .space("aside") + .space("footer") + .space("header") + .space("hr") + .space("nav") + .space("section") +} + +fn restricted() -> Rules { + Rules::new() + .element(Element::new("b")) + .element(Element::new("em")) + .element(Element::new("i")) + .element(Element::new("strong")) + .element(Element::new("u")) + .space("address") + .space("article") + .space("aside") + .space("blockquote") + .space("br") + .space("dd") + .space("div") + .space("dl") + .space("dt") + .space("footer") + .space("h1") + .space("h2") + .space("h3") + .space("h4") + .space("h5") + .space("h6") + .space("header") + .space("hgroup") + .space("hr") + .space("li") + .space("nav") + .space("ol") + .space("p") + .space("pre") + .space("section") + .space("ul") +} + +fn untrusted() -> Rules { + Rules::new() + .element( + Element::new("a") + .attribute("href", href()) + .mandatory_attribute("target", "_blank") + .mandatory_attribute("rel", "noreferrer noopener"), + ) + .element(Element::new("b")) + .element(Element::new("em")) + .element(Element::new("i")) + 
.element(Element::new("strong")) + .element(Element::new("u")) + .space("address") + .space("article") + .space("aside") + .space("blockquote") + .space("br") + .space("dd") + .space("div") + .space("dl") + .space("dt") + .space("footer") + .space("h1") + .space("h2") + .space("h3") + .space("h4") + .space("h5") + .space("h6") + .space("header") + .space("hgroup") + .space("hr") + .space("li") + .space("nav") + .space("ol") + .space("p") + .space("pre") + .space("section") + .space("ul") +} + +#[cfg(test)] +mod tests { + use super::{basic, default, relaxed, restricted, untrusted}; + + #[test] + fn basic_does_not_fail() { + let rules = basic(); + assert_eq!(rules.allowed_elements.len(), 31); + } + + #[test] + fn default_does_not_fail() { + let rules = default(); + assert_eq!(rules.allowed_elements.len(), 0); + assert_eq!(rules.space_elements.len(), 26); + assert_eq!(rules.delete_elements.len(), 6); + } + + #[test] + fn relaxed_does_not_fail() { + let rules = relaxed(); + assert_eq!(rules.allowed_elements.len(), 58); + assert_eq!(rules.space_elements.len(), 8); + } + + #[test] + fn restricted_does_not_fail() { + let rules = restricted(); + assert_eq!(rules.allowed_elements.len(), 5); + assert_eq!(rules.space_elements.len(), 26); + } + + #[test] + fn untrusted_does_not_fail() { + let rules = untrusted(); + assert_eq!(rules.allowed_elements.len(), 6); + assert_eq!(rules.space_elements.len(), 26); + } +} diff --git a/sanitize-html-rs/src/sanitize.rs b/sanitize-html-rs/src/sanitize.rs new file mode 100644 index 0000000..dc2ee91 --- /dev/null +++ b/sanitize-html-rs/src/sanitize.rs @@ -0,0 +1,167 @@ +use crate::rules::{Element, Rules}; +use html5ever::{interface::QualName, namespace_url, ns, LocalName}; +use kuchiki::{Attribute, ElementData, ExpandedName, NodeData, NodeRef}; + +fn simple_qual_name(name: &str) -> QualName { + QualName::new(None, ns!(), LocalName::from(name)) +} + +fn qual_name_to_string(name: &QualName) -> String { + if name.ns == ns!(html) || 
name.ns.is_empty() { + name.local.to_lowercase() + } else { + format!("{}:{}", name.ns.to_lowercase(), name.local.to_lowercase()) + } +} + +fn expanded_name_to_string(name: &ExpandedName) -> String { + if name.ns == ns!(html) || name.ns.is_empty() { + name.local.to_lowercase() + } else { + format!("{}:{}", name.ns.to_lowercase(), name.local.to_lowercase()) + } +} + +fn simple_element( + name: QualName, + attrs: Vec<(ExpandedName, Attribute)>, + children: Vec, +) -> NodeRef { + let element = NodeRef::new_element(name, attrs); + for child in children { + child.detach(); + element.append(child); + } + element +} + +fn create_space_text() -> NodeRef { + NodeRef::new_text(" ") +} + +enum ElementAction<'t> { + Keep(&'t Element), + Delete, + Space, + Elide, + Rename(&'t str), +} + +fn element_action<'t>(element_name: &QualName, rules: &'t Rules) -> ElementAction<'t> { + let name = qual_name_to_string(element_name); + if name == "html" || name == "body" { + ElementAction::Elide + } else if let Some(element_sanitizer) = rules.allowed_elements.get(&name) { + ElementAction::Keep(element_sanitizer) + } else if rules.delete_elements.contains(&name) { + ElementAction::Delete + } else if rules.space_elements.contains(&name) { + ElementAction::Space + } else if let Some(rename_to) = rules.rename_elements.get(&name) { + ElementAction::Rename(rename_to) + } else { + ElementAction::Elide + } +} + +fn clean_nodes(nodes: impl IntoIterator, rules: &Rules) -> Vec { + let mut result = Vec::new(); + for node in nodes { + let subnodes = clean_node(&node, rules); + result.extend(subnodes); + } + result +} + +fn clean_node(node: &NodeRef, rules: &Rules) -> Vec { + match node.data() { + NodeData::Document(..) => vec![], + NodeData::DocumentFragment => vec![], // TODO: ?? + NodeData::Doctype(..) => vec![], + NodeData::ProcessingInstruction(..) => vec![], + + NodeData::Text(..) => vec![node.clone()], + + NodeData::Comment(..) 
=> { + if rules.allow_comments { + vec![node.clone()] + } else { + vec![] + } + } + + NodeData::Element(ElementData { + ref name, + ref attributes, + .. + }) => { + match element_action(name, rules) { + ElementAction::Keep(element_sanitizer) => { + let mut new_attrs: Vec<(ExpandedName, Attribute)> = Vec::new(); + + /* whitelisted attributes */ + for (attr_name, attr_value) in attributes.borrow().map.iter() { + if element_sanitizer + .is_valid(&expanded_name_to_string(attr_name), &attr_value.value) + { + new_attrs.push((attr_name.clone(), attr_value.clone())); + } + } + + /* mandatory attributes */ + let mut mandatory_attributes: Vec<(&String, &String)> = + element_sanitizer.mandatory_attributes.iter().collect(); + mandatory_attributes.sort(); + for &(attr_name, attr_value) in mandatory_attributes.iter() { + new_attrs.push(( + ExpandedName::new(ns!(), LocalName::from(attr_name.as_str())), + Attribute { + prefix: None, + value: attr_value.into(), + }, + )); + } + + let children = clean_nodes(node.children(), rules); + let element = simple_element(name.clone(), new_attrs, children); + + vec![element] + } + + ElementAction::Delete => vec![], + + ElementAction::Elide => clean_nodes(node.children(), rules), + + ElementAction::Space => { + let mut nodes = clean_nodes(node.children(), rules); + if nodes.is_empty() { + nodes.push(create_space_text()); + } else { + nodes.insert(0, create_space_text()); + nodes.push(create_space_text()); + } + nodes + } + + ElementAction::Rename(rename_to) => { + let children = clean_nodes(node.children(), rules); + vec![simple_element( + simple_qual_name(rename_to), + Vec::new(), + children, + )] + } + } + } + } +} + +pub(crate) fn sanitize_dom(dom: &NodeRef, mode: &Rules) -> NodeRef { + let new_children = clean_nodes(dom.children(), mode); + let new_dom = NodeRef::new_document(); + for child in new_children { + child.detach(); + new_dom.append(child); + } + new_dom +} diff --git a/sanitize-html-rs/src/tests.rs 
b/sanitize-html-rs/src/tests.rs new file mode 100644 index 0000000..cd18700 --- /dev/null +++ b/sanitize-html-rs/src/tests.rs @@ -0,0 +1,645 @@ +#![cfg(test)] + +/* Tests for sanitize_str. Each test passes a fixed input string and a rule set (one of the predefined BASIC, DEFAULT, RELAXED, RESTRICTED, UNTRUSTED values, or a custom Rules value) to sanitize_str, unwraps the result and compares it with an exact expected string. */ use super::rules::predefined::*; +use super::rules::{Element, Rules}; +use super::sanitize_str; + +#[test] +/* An empty input must produce an empty output under BASIC, DEFAULT, RELAXED, RESTRICTED and UNTRUSTED. */ fn empty() { + assert_eq!(&sanitize_str(&BASIC, "").unwrap(), ""); + assert_eq!(&sanitize_str(&DEFAULT, "").unwrap(), ""); + assert_eq!(&sanitize_str(&RELAXED, "").unwrap(), ""); + assert_eq!(&sanitize_str(&RESTRICTED, "").unwrap(), ""); + assert_eq!(&sanitize_str(&UNTRUSTED, "").unwrap(), ""); +} + +/* basic */ + +const BASIC_HTML: &str = "Lorem ipsum dolor sit
amet "; + +#[test] +fn basic_default() { + assert_eq!( + &sanitize_str(&DEFAULT, BASIC_HTML).unwrap(), + "Lorem ipsum dolor sit amet " + ); +} + +#[test] +fn basic_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, BASIC_HTML).unwrap(), + "Lorem ipsum dolor sit amet alert(\"hello world\");" + ); +} + +#[test] +fn basic_basic() { + assert_eq!( + &sanitize_str(&BASIC, BASIC_HTML).unwrap(), + "Lorem ipsum dolor sit
amet alert(\"hello world\");" + ); +} + +#[test] +fn basic_relaxed() { + assert_eq!( + &sanitize_str(&RELAXED, BASIC_HTML).unwrap(), + "Lorem ipsum dolor sit
amet alert(\"hello world\");" + ); +} + +/* malformed */ + +/* NOTE(review): MALFORMED_HTML is not referenced by any test in this file, while the malicious_* tests below use MALICIOUS_HTML, for which no definition is visible here. Presumably the text between the two constants (and the markup inside the fixtures) was lost - confirm against the original file before relying on these tests. */ const MALFORMED_HTML: &str = "Lorem ipsum dolor sit
amet "; + +#[test] +fn malicious_default() { + assert_eq!( + &sanitize_str(&DEFAULT, MALICIOUS_HTML).unwrap(), + "Lorem ipsum dolor sit amet <script>alert(\"hello world\");" + ); +} + +#[test] +fn malicious_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, MALICIOUS_HTML).unwrap(), + "Lorem ipsum dolor sit amet <script>alert(\"hello world\");" + ); +} + +#[test] +fn malicious_basic() { + assert_eq!( + &sanitize_str(&BASIC, MALICIOUS_HTML).unwrap(), + "Lorem ipsum dolor sit
amet <script>alert(\"hello world\");" + ); +} + +#[test] +fn malicious_untrusted() { + assert_eq!( + &sanitize_str(&UNTRUSTED, MALICIOUS_HTML).unwrap(), + "Lorem ipsum dolor sit amet <script>alert(\"hello world\");" + ); +} + +#[test] +fn malicious_relaxed() { + assert_eq!( + &sanitize_str(&RELAXED, MALICIOUS_HTML).unwrap(), + "Lorem ipsum dolor sit
amet <script>alert(\"hello world\");" + ); +} + +/* raw-comment */ + +const RAW_COMMENT_HTML: &str = "Hello"; + +#[test] +fn raw_comment_default() { + assert_eq!(&sanitize_str(&DEFAULT, RAW_COMMENT_HTML).unwrap(), "Hello"); +} + +#[test] +fn raw_comment_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, RAW_COMMENT_HTML).unwrap(), + "Hello" + ); +} + +#[test] +fn raw_comment_basic() { + assert_eq!(&sanitize_str(&BASIC, RAW_COMMENT_HTML).unwrap(), "Hello"); +} + +#[test] +fn raw_comment_relaxed() { + assert_eq!(&sanitize_str(&RELAXED, RAW_COMMENT_HTML).unwrap(), "Hello"); +} + +/* protocol-based JS injection: simple, no spaces */ + +/* NOTE(review): every JS_INJECTION_HTML_n constant in this file is the plain text foo, and every expected output is foo as well, so as written these tests exercise no URL handling. Presumably the link markup carrying the script URL is missing from the literals - confirm against the original file. */ const JS_INJECTION_HTML_1: &str = "foo"; + +#[test] +fn js_injection_1_default() { + assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_1).unwrap(), "foo"); +} + +#[test] +fn js_injection_1_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, JS_INJECTION_HTML_1).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_1_basic() { + assert_eq!( + &sanitize_str(&BASIC, JS_INJECTION_HTML_1).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_1_relaxed() { + assert_eq!( + &sanitize_str(&RELAXED, JS_INJECTION_HTML_1).unwrap(), + "foo" + ); +} + +/* protocol-based JS injection: simple, spaces before */ + +const JS_INJECTION_HTML_2: &str = "foo"; + +#[test] +fn js_injection_2_default() { + assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_2).unwrap(), "foo"); +} + +#[test] +fn js_injection_2_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, JS_INJECTION_HTML_2).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_2_basic() { + assert_eq!( + &sanitize_str(&BASIC, JS_INJECTION_HTML_2).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_2_relaxed() { + assert_eq!( + &sanitize_str(&RELAXED, JS_INJECTION_HTML_2).unwrap(), + "foo" + ); +} + +/* protocol-based JS injection: simple, spaces after */ + +const JS_INJECTION_HTML_3: &str = "foo"; + +#[test] +fn js_injection_3_default() { + assert_eq!(&sanitize_str(&DEFAULT, 
JS_INJECTION_HTML_3).unwrap(), "foo"); +} + +#[test] +fn js_injection_3_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, JS_INJECTION_HTML_3).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_3_basic() { + assert_eq!( + &sanitize_str(&BASIC, JS_INJECTION_HTML_3).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_3_relaxed() { + assert_eq!( + &sanitize_str(&RELAXED, JS_INJECTION_HTML_3).unwrap(), + "foo" + ); +} + +/* protocol-based JS injection: simple, spaces before and after */ + +const JS_INJECTION_HTML_4: &str = "foo"; + +#[test] +fn js_injection_4_default() { + assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_4).unwrap(), "foo"); +} + +#[test] +fn js_injection_4_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, JS_INJECTION_HTML_4).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_4_basic() { + assert_eq!( + &sanitize_str(&BASIC, JS_INJECTION_HTML_4).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_4_relaxed() { + assert_eq!( + &sanitize_str(&RELAXED, JS_INJECTION_HTML_4).unwrap(), + "foo" + ); +} + +/* protocol-based JS injection: preceding colon */ + +const JS_INJECTION_HTML_5: &str = "foo"; + +#[test] +fn js_injection_5_default() { + assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_5).unwrap(), "foo"); +} + +#[test] +fn js_injection_5_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, JS_INJECTION_HTML_5).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_5_basic() { + assert_eq!( + &sanitize_str(&BASIC, JS_INJECTION_HTML_5).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_5_relaxed() { + assert_eq!( + &sanitize_str(&RELAXED, JS_INJECTION_HTML_5).unwrap(), + "foo" + ); +} + +/* protocol-based JS injection: UTF-8 encoding */ + +const JS_INJECTION_HTML_6: &str = "foo"; + +#[test] +fn js_injection_6_default() { + assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_6).unwrap(), "foo"); +} + +#[test] +fn js_injection_6_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, JS_INJECTION_HTML_6).unwrap(), 
+ "foo" + ); +} + +#[test] +fn js_injection_6_basic() { + assert_eq!( + &sanitize_str(&BASIC, JS_INJECTION_HTML_6).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_6_relaxed() { + assert_eq!( + &sanitize_str(&RELAXED, JS_INJECTION_HTML_6).unwrap(), + "foo" + ); +} + +/* protocol-based JS injection: long UTF-8 encoding */ + +const JS_INJECTION_HTML_7: &str = "foo"; + +#[test] +fn js_injection_7_default() { + assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_7).unwrap(), "foo"); +} + +#[test] +fn js_injection_7_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, JS_INJECTION_HTML_7).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_7_basic() { + assert_eq!( + &sanitize_str(&BASIC, JS_INJECTION_HTML_7).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_7_relaxed() { + assert_eq!( + &sanitize_str(&RELAXED, JS_INJECTION_HTML_7).unwrap(), + "foo" + ); +} + +/* protocol-based JS injection: long UTF-8 encoding without semicolons */ + +const JS_INJECTION_HTML_8: &str = "foo"; + +#[test] +fn js_injection_8_default() { + assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_8).unwrap(), "foo"); +} + +#[test] +fn js_injection_8_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, JS_INJECTION_HTML_8).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_8_basic() { + assert_eq!( + &sanitize_str(&BASIC, JS_INJECTION_HTML_8).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_8_relaxed() { + assert_eq!( + &sanitize_str(&RELAXED, JS_INJECTION_HTML_8).unwrap(), + "foo" + ); +} + +/* protocol-based JS injection: hex encoding */ + +const JS_INJECTION_HTML_9: &str = "foo"; + +#[test] +fn js_injection_9_default() { + assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_9).unwrap(), "foo"); +} + +#[test] +fn js_injection_9_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, JS_INJECTION_HTML_9).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_9_basic() { + assert_eq!( + &sanitize_str(&BASIC, JS_INJECTION_HTML_9).unwrap(), + "foo" + ); +} + +#[test] 
+fn js_injection_9_relaxed() { + assert_eq!( + &sanitize_str(&RELAXED, JS_INJECTION_HTML_9).unwrap(), + "foo" + ); +} + +/* protocol-based JS injection: long hex encoding */ + +const JS_INJECTION_HTML_10: &str = "foo"; + +#[test] +fn js_injection_10_default() { + assert_eq!( + &sanitize_str(&DEFAULT, JS_INJECTION_HTML_10).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_10_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, JS_INJECTION_HTML_10).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_10_basic() { + assert_eq!( + &sanitize_str(&BASIC, JS_INJECTION_HTML_10).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_10_relaxed() { + assert_eq!( + &sanitize_str(&RELAXED, JS_INJECTION_HTML_10).unwrap(), + "foo" + ); +} + +/* protocol-based JS injection: hex encoding without semicolons */ + +const JS_INJECTION_HTML_11: &str = "foo"; + +#[test] +fn js_injection_11_default() { + assert_eq!( + &sanitize_str(&DEFAULT, JS_INJECTION_HTML_11).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_11_restricted() { + assert_eq!( + &sanitize_str(&RESTRICTED, JS_INJECTION_HTML_11).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_11_basic() { + assert_eq!( + &sanitize_str(&BASIC, JS_INJECTION_HTML_11).unwrap(), + "foo" + ); +} + +#[test] +fn js_injection_11_relaxed() { + assert_eq!( + &sanitize_str(&RELAXED, JS_INJECTION_HTML_11).unwrap(), + "foo" + ); +} + +/* should translate valid HTML entities */ + +#[test] +fn misc_1() { + assert_eq!( + &sanitize_str(&DEFAULT, "Don't tasé me & bro!").unwrap(), + "Don't tasé me & bro!" 
+ ); +} + +/* should translate valid HTML entities while encoding unencoded ampersands */ + +#[test] +fn misc_2() { + assert_eq!( + &sanitize_str(&DEFAULT, "cookies² & ¼ créme").unwrap(), + "cookies² & ¼ créme" + ); +} + +/* should never output ' */ + +#[test] +fn misc_3() { + assert_eq!( + &sanitize_str( + &DEFAULT, + "IE6 isn't a real browser" + ) + .unwrap(), + "IE6 isn't a real browser" + ); +} + +/* should not choke on several instances of the same element in a row */ + +#[test] +fn misc_4() { + assert_eq!( + &sanitize_str(&DEFAULT, "").unwrap(), + "" + ); +} + +/* should surround the contents of :whitespace_elements with space characters when removing the element */ + +#[test] +fn misc_5() { + assert_eq!( + &sanitize_str(&DEFAULT, "foo
bar
baz").unwrap(), + "foo bar baz" + ); +} + +#[test] +fn misc_6() { + assert_eq!( + &sanitize_str(&DEFAULT, "foo
bar
baz").unwrap(), + "foo bar baz" + ); +} + +#[test] +fn misc_7() { + assert_eq!( + &sanitize_str(&DEFAULT, "foo
bar
baz").unwrap(), + "foo bar baz" + ); +} + +#[test] +/* Exercises a hand-built rule set instead of a predefined one: the builder chain turns allow_comments on, registers the elements b and span, marks script and style with delete, marks br with space, and renames strong to span; the sanitized output is then compared with an exact expected string. */ fn custom_rules() { + let rules = Rules::new() + .allow_comments(true) + .element(Element::new("b")) + .element(Element::new("span")) + .delete("script") + .delete("style") + .space("br") + .rename("strong", "span"); + + let html = "Lorem ipsum dolor sit
amet "; + + assert_eq!( + &sanitize_str(&rules, html).unwrap(), + "Lorem ipsum dolor sit amet " + ); +}