newsletter-to-web/sanitize-html-rs/src/rules/predefined.rs

381 lines
12 KiB
Rust

//! Predefined rules
//!
//! These rules are inspired by a great Ruby gem [sanitize](https://github.com/rgrove/sanitize/).
use super::pattern::Pattern;
use super::{Element, Rules};
use lazy_static::lazy_static;
use regex::Regex;
/// Builds a [`Pattern`] backed by the regular expression `regex`.
///
/// # Panics
///
/// Panics if `regex` is not a valid regular expression.
fn re(regex: &str) -> Pattern {
    let compiled = Regex::new(regex).unwrap();
    Pattern::regex(compiled)
}
/// Pattern applied to link targets (the `href` attribute of `a`).
///
/// Combines two regular expressions with `|`:
///
/// * an allow-list of schemes (`ftp:`, `http:`, `https:`, `mailto:`) anchored at the start of
///   the value, and
/// * the negation (`!`) of `^[^/]+[[:space:]]*:`, which matches values that have a colon
///   before their first slash — presumably so that scheme-less (relative) URLs are accepted.
fn href() -> Pattern {
    // `(?i)`: URI schemes are case-insensitive (RFC 3986, section 3.1). Without the flag a
    // value such as `HTTPS://example.com` misses the allow-list while still matching the
    // "has a scheme" expression below, so a perfectly valid link would be rejected.
    re("(?i)^(ftp:|http:|https:|mailto:)") | !re("^[^/]+[[:space:]]*:")
}
/// Pattern applied to resource and citation URLs (`img[src]` and the `cite` attributes).
///
/// Combines two regular expressions with `|`:
///
/// * an allow-list of schemes (`http:`, `https:`) anchored at the start of the value, and
/// * the negation (`!`) of `^[^/]+[[:space:]]*:`, which matches values that have a colon
///   before their first slash — presumably so that scheme-less (relative) URLs are accepted.
fn src() -> Pattern {
    // `(?i)`: URI schemes are case-insensitive (RFC 3986, section 3.1). Without the flag a
    // value such as `HTTP://example.com/a.png` misses the allow-list while still matching the
    // "has a scheme" expression below, so a perfectly valid URL would be rejected.
    re("(?i)^(http:|https:)") | !re("^[^/]+[[:space:]]*:")
}
lazy_static! {
    /// Shared instance of the rules built by [`basic()`]: formatting elements, links, and
    /// lists are allowed.
    pub static ref BASIC: Rules = basic();

    /// Shared instance of the rules built by [`default()`]: no element is allowed.
    pub static ref DEFAULT: Rules = default();

    /// Shared instance of the rules built by [`relaxed()`]: a wider variety of markup is
    /// allowed, including images and tables.
    pub static ref RELAXED: Rules = relaxed();

    /// Shared instance of the rules built by [`restricted()`]: only very simple inline
    /// markup is allowed. No links, images, or block elements.
    pub static ref RESTRICTED: Rules = restricted();

    /// Shared instance of the rules built by [`untrusted()`]: meant for documents from
    /// untrusted sources; only text emphasis and links are allowed.
    pub static ref UNTRUSTED: Rules = untrusted();
}
/// Basic rules. Allows a variety of markup including formatting elements, links, and lists.
pub fn basic() -> Rules {
Rules::new()
.element(Element::new("a").attribute("href", href()))
.element(Element::new("abbr").attribute("title", Pattern::any()))
.element(Element::new("b"))
.element(Element::new("blockquote").attribute("cite", src()))
.element(Element::new("br"))
.element(Element::new("br"))
.element(Element::new("cite"))
.element(Element::new("code"))
.element(Element::new("dd"))
.element(Element::new("dfn").attribute("title", Pattern::any()))
.element(Element::new("dl"))
.element(Element::new("dt"))
.element(Element::new("em"))
.element(Element::new("i"))
.element(Element::new("kbd"))
.element(Element::new("li"))
.element(Element::new("mark"))
.element(Element::new("ol"))
.element(Element::new("p"))
.element(Element::new("pre"))
.element(Element::new("q").attribute("cite", src()))
.element(Element::new("s"))
.element(Element::new("samp"))
.element(Element::new("small"))
.element(Element::new("strike"))
.element(Element::new("strong"))
.element(Element::new("sub"))
.element(Element::new("sup"))
.element(
Element::new("time")
.attribute("datetime", Pattern::any())
.attribute("pubdate", Pattern::any()),
)
.element(Element::new("u"))
.element(Element::new("ul"))
.element(Element::new("var"))
.space("address")
.space("article")
.space("aside")
.space("div")
.space("footer")
.space("h1")
.space("h2")
.space("h3")
.space("h4")
.space("h5")
.space("h6")
.space("header")
.space("hgroup")
.space("hr")
.space("nav")
.space("section")
.delete("element_name")
}
/// Default rules. Removes all tags.
///
/// No element is allowed. Block-level elements are registered through `Rules::space` and
/// `iframe`, `noembed`, `noframes`, `noscript`, `script` and `style` through `Rules::delete`.
pub fn default() -> Rules {
    // Elements passed to `Rules::space`, in registration order.
    const SPACED: &[&str] = &[
        "address", "article", "aside", "blockquote", "br", "dd", "div", "dl", "dt", "footer",
        "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "li", "nav", "ol", "p",
        "pre", "section", "ul",
    ];
    // Elements passed to `Rules::delete`, in registration order.
    const DELETED: &[&str] = &["iframe", "noembed", "noframes", "noscript", "script", "style"];

    let with_spaces = SPACED
        .iter()
        .copied()
        .fold(Rules::new(), |rules, name| rules.space(name));

    DELETED
        .iter()
        .copied()
        .fold(with_spaces, |rules, name| rules.delete(name))
}
/// Relaxed rules. Allows an even wider variety of markup, including images and tables
pub fn relaxed() -> Rules {
fn relaxed_element(name: &str) -> Element {
Element::new(name)
.attribute("dir", Pattern::any())
.attribute("lang", Pattern::any())
.attribute("title", Pattern::any())
.attribute("class", Pattern::any())
}
Rules::new()
.element(relaxed_element("a").attribute("href", href()))
.element(relaxed_element("abbr"))
.element(relaxed_element("b"))
.element(relaxed_element("bdo"))
.element(relaxed_element("blockquote").attribute("cite", src()))
.element(relaxed_element("br"))
.element(relaxed_element("caption"))
.element(relaxed_element("cite"))
.element(relaxed_element("code"))
.element(
relaxed_element("col")
.attribute("span", Pattern::any())
.attribute("width", Pattern::any()),
)
.element(
relaxed_element("colgroup")
.attribute("span", Pattern::any())
.attribute("width", Pattern::any()),
)
.element(relaxed_element("dd"))
.element(
relaxed_element("del")
.attribute("cite", src())
.attribute("datetime", Pattern::any()),
)
.element(relaxed_element("dfn"))
.element(relaxed_element("dl"))
.element(relaxed_element("dt"))
.element(relaxed_element("em"))
.element(relaxed_element("figcaption"))
.element(relaxed_element("figure"))
.element(relaxed_element("h1"))
.element(relaxed_element("h2"))
.element(relaxed_element("h3"))
.element(relaxed_element("h4"))
.element(relaxed_element("h5"))
.element(relaxed_element("h6"))
.element(relaxed_element("hgroup"))
.element(relaxed_element("i"))
.element(
relaxed_element("img")
.attribute("src", src())
.attribute("align", Pattern::any())
.attribute("alt", Pattern::any())
.attribute("width", Pattern::any())
.attribute("height", Pattern::any()),
)
.element(
relaxed_element("ins")
.attribute("cite", src())
.attribute("datetime", Pattern::any()),
)
.element(relaxed_element("kbd"))
.element(relaxed_element("li"))
.element(relaxed_element("mark"))
.element(
relaxed_element("ol")
.attribute("start", Pattern::any())
.attribute("reversed", Pattern::any())
.attribute("type", Pattern::any()),
)
.element(relaxed_element("p"))
.element(relaxed_element("pre"))
.element(relaxed_element("q").attribute("cite", src()))
.element(relaxed_element("rp"))
.element(relaxed_element("rt"))
.element(relaxed_element("ruby"))
.element(relaxed_element("s"))
.element(relaxed_element("samp"))
.element(relaxed_element("small"))
.element(relaxed_element("strike"))
.element(relaxed_element("strong"))
.element(relaxed_element("sub"))
.element(relaxed_element("sup"))
.element(
relaxed_element("table")
.attribute("summary", Pattern::any())
.attribute("width", Pattern::any()),
)
.element(relaxed_element("tbody"))
.element(
relaxed_element("td")
.attribute("abbr", Pattern::any())
.attribute("axis", Pattern::any())
.attribute("colspan", Pattern::any())
.attribute("rowspan", Pattern::any())
.attribute("width", Pattern::any()),
)
.element(relaxed_element("tfoot"))
.element(
relaxed_element("th")
.attribute("abbr", Pattern::any())
.attribute("axis", Pattern::any())
.attribute("colspan", Pattern::any())
.attribute("rowspan", Pattern::any())
.attribute("scope", Pattern::any())
.attribute("width", Pattern::any()),
)
.element(relaxed_element("thead"))
.element(
relaxed_element("time")
.attribute("datetime", Pattern::any())
.attribute("pubdate", Pattern::any()),
)
.element(relaxed_element("tr"))
.element(relaxed_element("u"))
.element(relaxed_element("ul").attribute("type", Pattern::any()))
.element(relaxed_element("var"))
.element(relaxed_element("wbr"))
.space("address")
.space("article")
.space("aside")
.space("footer")
.space("header")
.space("hr")
.space("nav")
.space("section")
}
/// Restricted rules. Allows only very simple inline markup. No links, images, or block elements.
///
/// The allowed elements are `b`, `em`, `i`, `strong` and `u`, all without attributes.
/// Block-level elements are registered through `Rules::space`.
pub fn restricted() -> Rules {
    // Allowed inline elements, in registration order.
    const INLINE: &[&str] = &["b", "em", "i", "strong", "u"];
    // Elements passed to `Rules::space`, in registration order.
    const SPACED: &[&str] = &[
        "address", "article", "aside", "blockquote", "br", "dd", "div", "dl", "dt", "footer",
        "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "li", "nav", "ol", "p",
        "pre", "section", "ul",
    ];

    let with_elements = INLINE
        .iter()
        .copied()
        .fold(Rules::new(), |rules, name| rules.element(Element::new(name)));

    SPACED
        .iter()
        .copied()
        .fold(with_elements, |rules, name| rules.space(name))
}
/// Rules for documents from untrusted sources. Removes all tags but text emphasis and links.
///
/// `a` keeps an `href` that matches the `href()` pattern and has `target="_blank"` and
/// `rel="noreferrer noopener"` registered as mandatory attributes. `b`, `em`, `i`, `strong`
/// and `u` are allowed without attributes. Block-level elements are registered through
/// `Rules::space`.
pub fn untrusted() -> Rules {
    // Allowed emphasis elements, in registration order (after `a`).
    const EMPHASIS: &[&str] = &["b", "em", "i", "strong", "u"];
    // Elements passed to `Rules::space`, in registration order.
    const SPACED: &[&str] = &[
        "address", "article", "aside", "blockquote", "br", "dd", "div", "dl", "dt", "footer",
        "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "li", "nav", "ol", "p",
        "pre", "section", "ul",
    ];

    let link = Element::new("a")
        .attribute("href", href())
        .mandatory_attribute("target", "_blank")
        .mandatory_attribute("rel", "noreferrer noopener");

    let with_elements = EMPHASIS
        .iter()
        .copied()
        .fold(Rules::new().element(link), |rules, name| {
            rules.element(Element::new(name))
        });

    SPACED
        .iter()
        .copied()
        .fold(with_elements, |rules, name| rules.space(name))
}
#[cfg(test)]
mod tests {
    use super::{basic, default, relaxed, restricted, untrusted};

    // Each test builds one predefined rule set (which must not panic) and pins the number
    // of elements registered in it.

    #[test]
    fn basic_does_not_fail() {
        assert_eq!(basic().allowed_elements.len(), 31);
    }

    #[test]
    fn default_does_not_fail() {
        let built = default();
        assert_eq!(built.allowed_elements.len(), 0);
        assert_eq!(built.space_elements.len(), 26);
        assert_eq!(built.delete_elements.len(), 6);
    }

    #[test]
    fn relaxed_does_not_fail() {
        let built = relaxed();
        assert_eq!(built.allowed_elements.len(), 58);
        assert_eq!(built.space_elements.len(), 8);
    }

    #[test]
    fn restricted_does_not_fail() {
        let built = restricted();
        assert_eq!(built.allowed_elements.len(), 5);
        assert_eq!(built.space_elements.len(), 26);
    }

    #[test]
    fn untrusted_does_not_fail() {
        let built = untrusted();
        assert_eq!(built.allowed_elements.len(), 6);
        assert_eq!(built.space_elements.len(), 26);
    }
}