Add email sanitization
Signed-off-by: Jacob Kiers <jacob@jacobkiers.net>
This commit is contained in:
parent
5962a4755b
commit
8f6f9c6e79
|
@ -258,20 +258,6 @@ version = "0.11.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
|
||||
|
||||
[[package]]
|
||||
name = "html5ever"
|
||||
version = "0.25.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5c13fb08e5d4dfc151ee5e88bae63f7773d61852f3bdc73c9f4b9e1bde03148"
|
||||
dependencies = [
|
||||
"log",
|
||||
"mac",
|
||||
"markup5ever 0.10.1",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "html5ever"
|
||||
version = "0.26.0"
|
||||
|
@ -280,7 +266,7 @@ checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
|
|||
dependencies = [
|
||||
"log",
|
||||
"mac",
|
||||
"markup5ever 0.11.0",
|
||||
"markup5ever",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
|
@ -349,25 +335,13 @@ name = "kuchiki"
|
|||
version = "0.8.1"
|
||||
dependencies = [
|
||||
"cssparser",
|
||||
"html5ever 0.26.0",
|
||||
"html5ever",
|
||||
"indexmap",
|
||||
"matches",
|
||||
"selectors",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kuchiki"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ea8e9c6e031377cff82ee3001dc8026cdf431ed4e2e6b51f98ab8c73484a358"
|
||||
dependencies = [
|
||||
"cssparser",
|
||||
"html5ever 0.25.2",
|
||||
"matches",
|
||||
"selectors",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.4.0"
|
||||
|
@ -428,20 +402,6 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "markup5ever"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd"
|
||||
dependencies = [
|
||||
"log",
|
||||
"phf 0.8.0",
|
||||
"phf_codegen 0.8.0",
|
||||
"string_cache",
|
||||
"string_cache_codegen",
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "markup5ever"
|
||||
version = "0.11.0"
|
||||
|
@ -480,9 +440,9 @@ version = "0.1.0"
|
|||
dependencies = [
|
||||
"base16ct",
|
||||
"imap",
|
||||
"kuchiki 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"mail-parser",
|
||||
"rustls-connector",
|
||||
"sanitize_html",
|
||||
"sha2",
|
||||
]
|
||||
|
||||
|
@ -853,8 +813,8 @@ checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695"
|
|||
name = "sanitize_html"
|
||||
version = "0.7.0"
|
||||
dependencies = [
|
||||
"html5ever 0.26.0",
|
||||
"kuchiki 0.8.1",
|
||||
"html5ever",
|
||||
"kuchiki",
|
||||
"lazy_static",
|
||||
"regex",
|
||||
]
|
||||
|
|
|
@ -9,7 +9,7 @@ description = "Converts email newsletters to static HTML files"
|
|||
[dependencies]
|
||||
base16ct = { version = "^0.1.0", features = [ "alloc" ] }
|
||||
imap = { version = "^2.4.1", default-features = false }
|
||||
kuchiki = "^0.8.1"
|
||||
mail-parser = "^0.4.8"
|
||||
rustls-connector = { version = "^0.16.1", default-features = false, features = [ "webpki-roots-certs", "quic" ] }
|
||||
sanitize_html = { path = "../sanitize-html-rs" }
|
||||
sha2 = "^0.10.2"
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
use std::{
|
||||
collections::HashMap,
|
||||
error::Error,
|
||||
fs::OpenOptions,
|
||||
fs::{read_dir, OpenOptions},
|
||||
io::Write,
|
||||
net::TcpStream,
|
||||
path::{Path, PathBuf},
|
||||
|
@ -11,12 +11,13 @@ use imap::Session;
|
|||
use mail_parser::Message;
|
||||
use rustls_connector::RustlsConnector;
|
||||
|
||||
use sanitize_html::{rules::Element, sanitize_str};
|
||||
use sha2::{Digest, Sha256};
|
||||
|
||||
extern crate imap;
|
||||
extern crate kuchiki;
|
||||
extern crate mail_parser;
|
||||
extern crate rustls_connector;
|
||||
extern crate sanitize_html;
|
||||
extern crate sha2;
|
||||
|
||||
fn main() {
|
||||
|
@ -25,21 +26,24 @@ fn main() {
|
|||
std::fs::create_dir(&dir).expect("Could not create directory");
|
||||
}
|
||||
|
||||
let messsages = connect("my.kiers.eu", 993, "newsletters@kie.rs", "Jjkcloudron1!")
|
||||
let messsages = messages_from_tests(&Path::new("tests/data")) //connect("my.kiers.eu", 993, "newsletters@kie.rs", "Jjkcloudron1!")
|
||||
.expect("A list of messages");
|
||||
|
||||
for (uid, message) in messsages {
|
||||
println!("Processing message {}\n", &uid);
|
||||
println!("Processing message {}", &uid);
|
||||
|
||||
let parsed = Message::parse(&message).expect("A parsed messsage.");
|
||||
let title = parsed.get_subject().expect("Expected a subject");
|
||||
|
||||
println!("{}", &title);
|
||||
|
||||
let html_body = parsed.get_html_body(0).expect("Could not read body");
|
||||
let html_bytes = html_body.as_bytes();
|
||||
|
||||
let processed_html = process_html(&html_body).expect("Could not process the HTML");
|
||||
let html_bytes = processed_html.as_bytes();
|
||||
|
||||
//println!("{}", &html_body);
|
||||
|
||||
let title = parsed.get_subject().expect("Expected a subject");
|
||||
|
||||
let hash = base16ct::lower::encode_string(&Sha256::digest(&html_bytes));
|
||||
println!("{}", hash);
|
||||
|
||||
|
@ -53,7 +57,6 @@ fn main() {
|
|||
|
||||
// let new_html = dom.as_text()
|
||||
|
||||
|
||||
let path: PathBuf = [dir, Path::new(&format!("{}.html", &title))]
|
||||
.iter()
|
||||
.collect();
|
||||
|
@ -65,6 +68,8 @@ fn main() {
|
|||
|
||||
file.write_all(&html_bytes)
|
||||
.expect("Could not write to file.");
|
||||
|
||||
println!();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -73,7 +78,7 @@ fn connect(
|
|||
port: u16,
|
||||
username: &str,
|
||||
password: &str,
|
||||
) -> Result<HashMap<u32, Vec<u8>>, Box<dyn Error>> {
|
||||
) -> Result<HashMap<String, Vec<u8>>, Box<dyn Error>> {
|
||||
let mut session = open_session(server, port, username, password)?;
|
||||
|
||||
session.examine("INBOX")?;
|
||||
|
@ -83,7 +88,7 @@ fn connect(
|
|||
Err(e) => return Err(Box::new(e)),
|
||||
};
|
||||
|
||||
let mut msgs = HashMap::with_capacity(items.len());
|
||||
let mut msgs = HashMap::<String, Vec<u8>>::with_capacity(items.len());
|
||||
|
||||
//println!("# of messages: {}", &items.len());
|
||||
|
||||
|
@ -98,7 +103,7 @@ fn connect(
|
|||
|
||||
let body = message.body().expect("Message did not have a body.");
|
||||
|
||||
msgs.insert(item, body.to_owned());
|
||||
msgs.insert(item.to_string(), body.to_owned());
|
||||
}
|
||||
|
||||
session.logout().expect("Could not log out");
|
||||
|
@ -128,3 +133,65 @@ fn open_session(
|
|||
|
||||
Ok(client.login(username, password).map_err(|e| e.0)?)
|
||||
}
|
||||
|
||||
fn process_html(input: &str) -> Result<String, sanitize_html::errors::SanitizeError> {
|
||||
let mut rules = sanitize_html::rules::predefined::relaxed().delete("style");
|
||||
|
||||
rules
|
||||
.allowed_elements
|
||||
.get_mut("img")
|
||||
.unwrap()
|
||||
.attribute_rules
|
||||
.rename("src", "data-source");
|
||||
|
||||
let mut span = Element::new("span");
|
||||
|
||||
span.attribute_rules
|
||||
.modify("style", Box::new(|_i| "".to_string()));
|
||||
|
||||
let rules = rules.element(span);
|
||||
|
||||
//rules.allowed_elements.remove_entry("img");
|
||||
|
||||
sanitize_str(&rules, input)
|
||||
//Ok(input.to_owned())
|
||||
}
|
||||
|
||||
/// Loads every `*.eml` file directly inside `path` into a map of raw messages.
///
/// The key of each entry is the file stem (the file name without `.eml`) and
/// the value is the unmodified file content. Files with another extension or
/// without an extension are ignored.
///
/// Loading is best effort: directory entries that cannot be listed and files
/// that cannot be read are skipped silently. A file stem that is not valid
/// UTF-8 is converted lossily instead of aborting the whole run.
///
/// # Errors
///
/// Returns the underlying I/O error when `path` itself cannot be read as a
/// directory.
fn messages_from_tests(path: &Path) -> Result<HashMap<String, Vec<u8>>, Box<dyn Error>> {
    let mut messages = HashMap::<String, Vec<u8>>::new();

    // `flatten` drops the entries the OS failed to report.
    for entry in read_dir(path)?.flatten() {
        let file = entry.path();

        if file.extension().map_or(true, |ext| ext != "eml") {
            continue;
        }

        let uid = match file.file_stem() {
            Some(stem) => stem.to_string_lossy().into_owned(),
            None => continue,
        };

        if let Ok(data) = std::fs::read(&file) {
            messages.insert(uid, data);
        }
    }

    Ok(messages)
}
|
||||
|
||||
/// Stores a raw message as `tests/data/<uid>.eml`, relative to the current
/// working directory, so it can later be loaded as test input.
///
/// An existing file for the same `uid` is replaced completely.
///
/// # Panics
///
/// Panics when the file cannot be opened or created (for example when
/// `tests/data` does not exist). A failure while writing the content is only
/// reported on stderr, keeping the write itself best effort.
fn write_to_test_path(uid: &str, message: &[u8]) {
    let test_path = Path::new("tests/data").join(format!("{}.eml", uid));

    let mut file = OpenOptions::new()
        .write(true)
        .create(true)
        // Drop previous contents so a shorter message leaves no stale tail.
        .truncate(true)
        .open(&test_path)
        .expect("Could not open file for writing");

    if let Err(e) = file.write_all(message) {
        eprintln!("Could not write {}: {}", test_path.display(), e);
    }
}
|
||||
|
|
|
@ -16,6 +16,8 @@ pub struct Element {
|
|||
/// List of mandatory attributes and their values.
|
||||
/// These attributes will be forcibly added to element.
|
||||
pub mandatory_attributes: HashMap<String, String>,
|
||||
/// Attribute rules
|
||||
pub attribute_rules: AttributeRules,
|
||||
}
|
||||
|
||||
impl Element {
|
||||
|
@ -25,6 +27,7 @@ impl Element {
|
|||
name: name.to_owned(),
|
||||
attributes: HashMap::new(),
|
||||
mandatory_attributes: HashMap::new(),
|
||||
attribute_rules: AttributeRules::new(),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -102,3 +105,37 @@ impl Rules {
|
|||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Structure to define rules for attributes.
///
/// Holds two independent rule tables, both keyed by attribute name:
/// renames and content modifiers. Registering a rule for a name that already
/// has one replaces the earlier rule.
#[derive(Default)]
pub struct AttributeRules {
    /// Attributes which will be renamed: current attribute name -> new name.
    pub rename_attributes: HashMap<String, String>,
    /// Functions to modify attribute contents: attribute name -> function
    /// that receives the current value and returns the replacement value.
    pub modify_attributes: HashMap<String, Box<dyn Fn(String) -> String + Sync>>,
}

impl AttributeRules {
    /// Create a new, empty attribute rules set.
    pub fn new() -> Self {
        Self::default()
    }

    /// Adds a rule to rename the attribute `attribute_name` to `to`.
    ///
    /// Returns `self` mutably so further rules can be chained.
    pub fn rename(&mut self, attribute_name: &str, to: &str) -> &mut Self {
        self.rename_attributes
            .insert(attribute_name.to_owned(), to.to_owned());
        self
    }

    /// Adds a rule with a function to modify the contents of the attribute
    /// `attribute_name`.
    ///
    /// Returns `self` mutably so further rules can be chained.
    pub fn modify(
        &mut self,
        attribute_name: &str,
        function: Box<dyn Fn(String) -> String + Sync>,
    ) -> &mut Self {
        self.modify_attributes
            .insert(attribute_name.to_owned(), function);
        self
    }
}
|
||||
|
|
|
@ -36,7 +36,8 @@ lazy_static! {
|
|||
pub static ref UNTRUSTED: Rules = untrusted();
|
||||
}
|
||||
|
||||
fn basic() -> Rules {
|
||||
/// Basic rules. Allows a variety of markup including formatting elements, links, and lists.
|
||||
pub fn basic() -> Rules {
|
||||
Rules::new()
|
||||
.element(Element::new("a").attribute("href", href()))
|
||||
.element(Element::new("abbr").attribute("title", Pattern::any()))
|
||||
|
@ -90,9 +91,11 @@ fn basic() -> Rules {
|
|||
.space("hr")
|
||||
.space("nav")
|
||||
.space("section")
|
||||
.delete("element_name")
|
||||
}
|
||||
|
||||
fn default() -> Rules {
|
||||
/// Default rules. Removes all tags.
|
||||
pub fn default() -> Rules {
|
||||
Rules::new()
|
||||
.space("address")
|
||||
.space("article")
|
||||
|
@ -128,7 +131,8 @@ fn default() -> Rules {
|
|||
.delete("style")
|
||||
}
|
||||
|
||||
fn relaxed() -> Rules {
|
||||
/// Relaxed rules. Allows an even wider variety of markup, including images and tables
|
||||
pub fn relaxed() -> Rules {
|
||||
fn relaxed_element(name: &str) -> Element {
|
||||
Element::new(name)
|
||||
.attribute("dir", Pattern::any())
|
||||
|
@ -257,7 +261,8 @@ fn relaxed() -> Rules {
|
|||
.space("section")
|
||||
}
|
||||
|
||||
fn restricted() -> Rules {
|
||||
/// Restricted rules. Allows only very simple inline markup. No links, images, or block elements.
|
||||
pub fn restricted() -> Rules {
|
||||
Rules::new()
|
||||
.element(Element::new("b"))
|
||||
.element(Element::new("em"))
|
||||
|
@ -292,7 +297,8 @@ fn restricted() -> Rules {
|
|||
.space("ul")
|
||||
}
|
||||
|
||||
fn untrusted() -> Rules {
|
||||
/// Rules for document from untrusted sources. Removes all tags but text emphasizing and links.
|
||||
pub fn untrusted() -> Rules {
|
||||
Rules::new()
|
||||
.element(
|
||||
Element::new("a")
|
||||
|
|
|
@ -101,11 +101,46 @@ fn clean_node(node: &NodeRef, rules: &Rules) -> Vec<NodeRef> {
|
|||
|
||||
/* whitelisted attributes */
|
||||
for (attr_name, attr_value) in attributes.borrow().map.iter() {
|
||||
if element_sanitizer
|
||||
.is_valid(&expanded_name_to_string(attr_name), &attr_value.value)
|
||||
let expanded_name = expanded_name_to_string(attr_name);
|
||||
|
||||
let new_value = if !element_sanitizer.attribute_rules.modify_attributes.contains_key(&expanded_name) {
|
||||
attr_value.clone()
|
||||
} else {
|
||||
let func = element_sanitizer.attribute_rules.modify_attributes.get(&expanded_name).unwrap();
|
||||
let new_value = func(attr_value.value.clone());
|
||||
Attribute {
|
||||
prefix: attr_value.prefix.clone(),
|
||||
value: new_value
|
||||
}
|
||||
};
|
||||
|
||||
if !element_sanitizer
|
||||
.is_valid(&expanded_name_to_string(attr_name), &new_value.value)
|
||||
{
|
||||
new_attrs.push((attr_name.clone(), attr_value.clone()));
|
||||
continue;
|
||||
}
|
||||
|
||||
let name = &attr_name.local.to_string();
|
||||
let new_name = if element_sanitizer
|
||||
.attribute_rules
|
||||
.rename_attributes
|
||||
.contains_key(name)
|
||||
{
|
||||
ExpandedName::new(
|
||||
attr_name.ns.clone(),
|
||||
String::from(
|
||||
element_sanitizer
|
||||
.attribute_rules
|
||||
.rename_attributes
|
||||
.get(name)
|
||||
.unwrap(),
|
||||
),
|
||||
)
|
||||
} else {
|
||||
attr_name.clone()
|
||||
};
|
||||
|
||||
new_attrs.push((new_name, attr_value.clone()));
|
||||
}
|
||||
|
||||
/* mandatory attributes */
|
||||
|
|
Loading…
Reference in New Issue