diff --git a/Cargo.lock b/Cargo.lock index de99dc6..4d376bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -258,20 +258,6 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" -[[package]] -name = "html5ever" -version = "0.25.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5c13fb08e5d4dfc151ee5e88bae63f7773d61852f3bdc73c9f4b9e1bde03148" -dependencies = [ - "log", - "mac", - "markup5ever 0.10.1", - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "html5ever" version = "0.26.0" @@ -280,7 +266,7 @@ checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" dependencies = [ "log", "mac", - "markup5ever 0.11.0", + "markup5ever", "proc-macro2", "quote", "syn", @@ -349,25 +335,13 @@ name = "kuchiki" version = "0.8.1" dependencies = [ "cssparser", - "html5ever 0.26.0", + "html5ever", "indexmap", "matches", "selectors", "tempfile", ] -[[package]] -name = "kuchiki" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358" -dependencies = [ - "cssparser", - "html5ever 0.25.2", - "matches", - "selectors", -] - [[package]] name = "lazy_static" version = "1.4.0" @@ -428,20 +402,6 @@ dependencies = [ "serde", ] -[[package]] -name = "markup5ever" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd" -dependencies = [ - "log", - "phf 0.8.0", - "phf_codegen 0.8.0", - "string_cache", - "string_cache_codegen", - "tendril", -] - [[package]] name = "markup5ever" version = "0.11.0" @@ -480,9 +440,9 @@ version = "0.1.0" dependencies = [ "base16ct", "imap", - "kuchiki 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", "mail-parser", "rustls-connector", + "sanitize_html", "sha2", ] @@ -853,8 +813,8 @@ checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695" name = "sanitize_html" version = "0.7.0" dependencies = [ - "html5ever 0.26.0", - "kuchiki 0.8.1", + "html5ever", + "kuchiki", "lazy_static", "regex", ] diff --git a/bin/Cargo.toml b/bin/Cargo.toml index ed72b05..84af6bb 100644 --- a/bin/Cargo.toml +++ b/bin/Cargo.toml @@ -9,7 +9,7 @@ description = "Converts email newsletters to static HTML files" [dependencies] base16ct = { version = "^0.1.0", features = [ "alloc" ] } imap = { version = "^2.4.1", default-features = false } -kuchiki = "^0.8.1" mail-parser = "^0.4.8" rustls-connector = { version = "^0.16.1", default-features = false, features = [ "webpki-roots-certs", "quic" ] } +sanitize_html = { path = "../sanitize-html-rs" } sha2 = "^0.10.2" diff --git a/bin/src/main.rs b/bin/src/main.rs index c48002e..cf4b35f 100644 --- a/bin/src/main.rs +++ b/bin/src/main.rs @@ -1,7 +1,7 @@ use std::{ collections::HashMap, error::Error, - fs::OpenOptions, + fs::{read_dir, OpenOptions}, io::Write, net::TcpStream, path::{Path, PathBuf}, @@ -11,12 +11,13 @@ use imap::Session; use mail_parser::Message; use rustls_connector::RustlsConnector; +use sanitize_html::{rules::Element, sanitize_str}; use sha2::{Digest, Sha256}; extern crate imap; -extern crate kuchiki; extern crate mail_parser; extern crate rustls_connector; +extern crate sanitize_html; extern crate sha2; fn main() { @@ -25,21 +26,24 @@ fn main() { std::fs::create_dir(&dir).expect("Could not create directory"); } - let messsages = connect("my.kiers.eu", 993, "newsletters@kie.rs", "Jjkcloudron1!") + let messsages = messages_from_tests(&Path::new("tests/data")) //connect("my.kiers.eu", 993, "newsletters@kie.rs", "Jjkcloudron1!") .expect("A list of messages"); for (uid, message) in messsages { - println!("Processing message {}\n", &uid); + println!("Processing message {}", &uid); let parsed = Message::parse(&message).expect("A parsed messsage."); + let title = parsed.get_subject().expect("Expected a subject"); + + println!("{}", &title); let html_body = parsed.get_html_body(0).expect("Could not read body"); - let html_bytes = html_body.as_bytes(); + + let processed_html = process_html(&html_body).expect("Could not process the HTML"); + let html_bytes = processed_html.as_bytes(); //println!("{}", &html_body); - let title = parsed.get_subject().expect("Expected a subject"); - let hash = base16ct::lower::encode_string(&Sha256::digest(&html_bytes)); println!("{}", hash); @@ -53,7 +57,6 @@ fn main() { // let new_html = dom.as_text() - let path: PathBuf = [dir, Path::new(&format!("{}.html", &title))] .iter() .collect(); @@ -65,6 +68,8 @@ fn main() { file.write_all(&html_bytes) .expect("Could not write to file."); + + println!(); } } @@ -73,7 +78,7 @@ fn connect( port: u16, username: &str, password: &str, -) -> Result>, Box> { +) -> Result>, Box> { let mut session = open_session(server, port, username, password)?; session.examine("INBOX")?; @@ -83,7 +88,7 @@ fn connect( Err(e) => return Err(Box::new(e)), }; - let mut msgs = HashMap::with_capacity(items.len()); + let mut msgs = HashMap::>::with_capacity(items.len()); //println!("# of messages: {}", &items.len()); @@ -98,7 +103,7 @@ fn connect( let body = message.body().expect("Message did not have a body."); - msgs.insert(item, body.to_owned()); + msgs.insert(item.to_string(), body.to_owned()); } session.logout().expect("Could not log out"); @@ -128,3 +133,65 @@ fn open_session( Ok(client.login(username, password).map_err(|e| e.0)?) } + +fn process_html(input: &str) -> Result { + let mut rules = sanitize_html::rules::predefined::relaxed().delete("style"); + + rules + .allowed_elements + .get_mut("img") + .unwrap() + .attribute_rules + .rename("src", "data-source"); + + let mut span = Element::new("span"); + + span.attribute_rules + .modify("style", Box::new(|_i| "".to_string())); + + let rules = rules.element(span); + + //rules.allowed_elements.remove_entry("img"); + + sanitize_str(&rules, input) + //Ok(input.to_owned()) +} + +fn messages_from_tests(path: &Path) -> Result>, Box> { + let mut messages = HashMap::>::new(); + + let items = match read_dir(path) { + Ok(i) => i, + Err(e) => return Err(Box::new(e)), + }; + + for item in items { + if let Ok(item) = item { + if let Some(extension) = item.path().extension() { + if extension != "eml" { + continue; + } + + let uid = item.path().file_stem().unwrap().to_owned(); + if let Ok(data) = std::fs::read(item.path()) { + messages.insert(uid.into_string().unwrap(), data); + } + } + } + } + + Ok(messages) +} + +fn write_to_test_path(uid: &str, message: &[u8]) { + let test_path: PathBuf = [Path::new("tests/data"), Path::new(&format!("{}.eml", &uid))] + .iter() + .collect(); + + let _ = OpenOptions::new() + .write(true) + .create(true) + .open(test_path) + .expect("Could not open file fir writing") + .write_all(&message); +} diff --git a/sanitize-html-rs/src/rules/mod.rs b/sanitize-html-rs/src/rules/mod.rs index 775382b..2fe60e2 100644 --- a/sanitize-html-rs/src/rules/mod.rs +++ b/sanitize-html-rs/src/rules/mod.rs @@ -16,6 +16,8 @@ pub struct Element { /// List of mandatory atributes and their values. /// These attributes will be forcibly added to element. pub mandatory_attributes: HashMap, + /// Attribute rules + pub attribute_rules: AttributeRules, } impl Element { @@ -25,6 +27,7 @@ impl Element { name: name.to_owned(), attributes: HashMap::new(), mandatory_attributes: HashMap::new(), + attribute_rules: AttributeRules::new(), } } @@ -102,3 +105,37 @@ impl Rules { self } } + +/// Structure to define rules for attributes +#[derive(Default)] +pub struct AttributeRules { + /// Atrributes which will be renamed. + pub rename_attributes: HashMap, + /// Functions to modify attribute contents + pub modify_attributes: HashMap String + Sync>>, +} + +impl AttributeRules { + /// Create a new attribute rules set. + pub fn new() -> Self { + Self::default() + } + + /// Adds a rule to rename an attribute + pub fn rename(&mut self, attribute_name: &str, to: &str) -> &Self { + self.rename_attributes + .insert(attribute_name.to_owned(), to.to_owned()); + self + } + + /// Adds a rule with a function to modify the contents of an attribute + pub fn modify( + &mut self, + attribute_name: &str, + function: Box String + Sync>, + ) -> &Self { + self.modify_attributes + .insert(attribute_name.to_owned(), function); + self + } +} diff --git a/sanitize-html-rs/src/rules/predefined.rs b/sanitize-html-rs/src/rules/predefined.rs index 6710ae3..12fbf72 100644 --- a/sanitize-html-rs/src/rules/predefined.rs +++ b/sanitize-html-rs/src/rules/predefined.rs @@ -36,7 +36,8 @@ lazy_static! { pub static ref UNTRUSTED: Rules = untrusted(); } -fn basic() -> Rules { +/// Basic rules. Allows a variety of markup including formatting elements, links, and lists. +pub fn basic() -> Rules { Rules::new() .element(Element::new("a").attribute("href", href())) .element(Element::new("abbr").attribute("title", Pattern::any())) @@ -90,9 +91,11 @@ fn basic() -> Rules { .space("hr") .space("nav") .space("section") + .delete("element_name") } -fn default() -> Rules { +/// Default rules. Removes all tags. +pub fn default() -> Rules { Rules::new() .space("address") .space("article") @@ -128,7 +131,8 @@ fn default() -> Rules { .delete("style") } -fn relaxed() -> Rules { +/// Relaxed rules. Allows an even wider variety of markup, including images and tables +pub fn relaxed() -> Rules { fn relaxed_element(name: &str) -> Element { Element::new(name) .attribute("dir", Pattern::any()) @@ -257,7 +261,8 @@ fn relaxed() -> Rules { .space("section") } -fn restricted() -> Rules { +/// Restricted rules. Allows only very simple inline markup. No links, images, or block elements. +pub fn restricted() -> Rules { Rules::new() .element(Element::new("b")) .element(Element::new("em")) @@ -292,7 +297,8 @@ fn restricted() -> Rules { .space("ul") } -fn untrusted() -> Rules { +/// Rules for document from untrusted sources. Removes all tags but text emphasizing and links. +pub fn untrusted() -> Rules { Rules::new() .element( Element::new("a") diff --git a/sanitize-html-rs/src/sanitize.rs b/sanitize-html-rs/src/sanitize.rs index dc2ee91..bb55795 100644 --- a/sanitize-html-rs/src/sanitize.rs +++ b/sanitize-html-rs/src/sanitize.rs @@ -101,11 +101,46 @@ fn clean_node(node: &NodeRef, rules: &Rules) -> Vec { /* whitelisted attributes */ for (attr_name, attr_value) in attributes.borrow().map.iter() { - if element_sanitizer - .is_valid(&expanded_name_to_string(attr_name), &attr_value.value) + let expanded_name = expanded_name_to_string(attr_name); + + let new_value = if !element_sanitizer.attribute_rules.modify_attributes.contains_key(&expanded_name) { + attr_value.clone() + } else { + let func = element_sanitizer.attribute_rules.modify_attributes.get(&expanded_name).unwrap(); + let new_value = func(attr_value.value.clone()); + Attribute { + prefix: attr_value.prefix.clone(), + value: new_value + } + }; + + if !element_sanitizer + .is_valid(&expanded_name_to_string(attr_name), &new_value.value) { - new_attrs.push((attr_name.clone(), attr_value.clone())); + continue; } + + let name = &attr_name.local.to_string(); + let new_name = if element_sanitizer + .attribute_rules + .rename_attributes + .contains_key(name) + { + ExpandedName::new( + attr_name.ns.clone(), + String::from( + element_sanitizer + .attribute_rules + .rename_attributes + .get(name) + .unwrap(), + ), + ) + } else { + attr_name.clone() + }; + + new_attrs.push((new_name, attr_value.clone())); } /* mandatory attributes */