Manually remove images instead of parsing

Instead of relying on a set of heavyweight HTML-parsing libraries, simply
replace the string "src" with "data-source". This covers most cases of
removing images.
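
For illustration, a minimal sketch of what the change amounts to. The
process_html body matches the function added in the diff below; the calling
code, example URL, and assertion are hypothetical:

// Renaming every "src" substring to "data-source" keeps the markup intact,
// but browsers no longer recognise the attribute and so do not fetch the image.
fn process_html(input: &str) -> Result<String, ()> {
    Ok(input.replace("src", "data-source"))
}

fn main() {
    // Hypothetical input, for illustration only.
    let html = r#"<p>Hi</p><img src="https://example.com/pixel.png" alt="">"#;
    let out = process_html(html).expect("replacement cannot fail");
    assert_eq!(
        out,
        r#"<p>Hi</p><img data-source="https://example.com/pixel.png" alt="">"#
    );
    println!("{}", out);
}

Note that the replacement touches every occurrence of "src" in the body (for
example inside "srcset" or plain text), not only <img> attributes, which is
the trade-off behind "most cases" above.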

This also removes the previously vendored kuchiki and sanitize-html-rs
libraries.

Signed-off-by: Jacob Kiers <jacob@jacobkiers.net>
Jacob Kiers 2022-08-02 23:11:55 +02:00
parent 9d41fcd463
commit abf4c787ab
40 changed files with 3 additions and 4822 deletions

Cargo.lock generated

@ -62,12 +62,6 @@ version = "3.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37ccbd214614c6783386c1af30caf03192f17891059cecc394b4fb119e363de3"
[[package]]
name = "byteorder"
version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]]
name = "cc"
version = "1.0.73"
@ -93,12 +87,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "convert_case"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
[[package]]
name = "cpufeatures"
version = "0.2.2"
@ -118,46 +106,6 @@ dependencies = [
"typenum",
]
[[package]]
name = "cssparser"
version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"matches",
"phf 0.8.0",
"proc-macro2",
"quote",
"smallvec",
"syn",
]
[[package]]
name = "cssparser-macros"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfae75de57f2b2e85e8768c3ea840fd159c8f33e2b6522c7835b7abac81be16e"
dependencies = [
"quote",
"syn",
]
[[package]]
name = "derive_more"
version = "0.99.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321"
dependencies = [
"convert_case",
"proc-macro2",
"quote",
"rustc_version",
"syn",
]
[[package]]
name = "digest"
version = "0.10.3"
@ -168,21 +116,6 @@ dependencies = [
"crypto-common",
]
[[package]]
name = "dtoa"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56899898ce76aaf4a0f24d914c97ea6ed976d42fec6ad33fcbb0a1103e07b2b0"
[[package]]
name = "dtoa-short"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bde03329ae10e79ede66c9ce4dc930aa8599043b0743008548680f25b91502d6"
dependencies = [
"dtoa",
]
[[package]]
name = "encoding_rs"
version = "0.8.31"
@ -192,34 +125,6 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "fastrand"
version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3fcf0cee53519c866c09b5de1f6c56ff9d647101f81c1964fa632e148896cdf"
dependencies = [
"instant",
]
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "generic-array"
version = "0.14.5"
@ -230,48 +135,6 @@ dependencies = [
"version_check",
]
[[package]]
name = "getrandom"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
dependencies = [
"cfg-if",
"libc",
"wasi 0.9.0+wasi-snapshot-preview1",
]
[[package]]
name = "getrandom"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9be70c98951c83b8d2f8f60d7065fa6d5146873094452a1008da8c2f1e4205ad"
dependencies = [
"cfg-if",
"libc",
"wasi 0.10.0+wasi-snapshot-preview1",
]
[[package]]
name = "hashbrown"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
[[package]]
name = "html5ever"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
dependencies = [
"log",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "imap"
version = "2.4.1"
@ -296,31 +159,6 @@ dependencies = [
"nom",
]
[[package]]
name = "indexmap"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6012d540c5baa3589337a98ce73408de9b5a25ec9fc2c6fd6be8f0d39e0ca5a"
dependencies = [
"autocfg",
"hashbrown",
]
[[package]]
name = "instant"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
dependencies = [
"cfg-if",
]
[[package]]
name = "itoa"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
[[package]]
name = "js-sys"
version = "0.3.57"
@ -330,18 +168,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "kuchiki"
version = "0.8.1"
dependencies = [
"cssparser",
"html5ever",
"indexmap",
"matches",
"selectors",
"tempfile",
]
[[package]]
name = "lazy_static"
version = "1.4.0"
@ -367,16 +193,6 @@ version = "0.2.126"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836"
[[package]]
name = "lock_api"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "327fa5b6a6940e4699ec49a9beae1ea4845c6bab9314e4f84ac68742139d8c53"
dependencies = [
"autocfg",
"scopeguard",
]
[[package]]
name = "log"
version = "0.4.17"
@ -386,12 +202,6 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "mail-parser"
version = "0.5.0"
@ -402,38 +212,12 @@ dependencies = [
"serde",
]
[[package]]
name = "markup5ever"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
dependencies = [
"log",
"phf 0.10.1",
"phf_codegen 0.10.0",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "matches"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f"
[[package]]
name = "memchr"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "new_debug_unreachable"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
[[package]]
name = "newsletter-to-web"
version = "0.1.0"
@ -442,16 +226,9 @@ dependencies = [
"imap",
"mail-parser",
"rustls-connector",
"sanitize_html",
"sha2",
]
[[package]]
name = "nodrop"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
[[package]]
name = "nom"
version = "5.1.2"
@ -488,139 +265,6 @@ version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7709cef83f0c1f58f666e746a08b21e0085f7440fa6a29cc194d68aac97a4225"
[[package]]
name = "parking_lot"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
dependencies = [
"lock_api",
"parking_lot_core",
]
[[package]]
name = "parking_lot_core"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09a279cbf25cb0757810394fbc1e359949b59e348145c643a939a525692e6929"
dependencies = [
"cfg-if",
"libc",
"redox_syscall",
"smallvec",
"windows-sys",
]
[[package]]
name = "phf"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
dependencies = [
"phf_macros",
"phf_shared 0.8.0",
"proc-macro-hack",
]
[[package]]
name = "phf"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
dependencies = [
"phf_shared 0.10.0",
]
[[package]]
name = "phf_codegen"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
dependencies = [
"phf_generator 0.8.0",
"phf_shared 0.8.0",
]
[[package]]
name = "phf_codegen"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
]
[[package]]
name = "phf_generator"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
dependencies = [
"phf_shared 0.8.0",
"rand 0.7.3",
]
[[package]]
name = "phf_generator"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
dependencies = [
"phf_shared 0.10.0",
"rand 0.8.5",
]
[[package]]
name = "phf_macros"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c"
dependencies = [
"phf_generator 0.8.0",
"phf_shared 0.8.0",
"proc-macro-hack",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "phf_shared"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
dependencies = [
"siphasher",
]
[[package]]
name = "phf_shared"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher",
]
[[package]]
name = "ppv-lite86"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro-hack"
version = "0.5.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"
[[package]]
name = "proc-macro2"
version = "1.0.39"
@ -639,96 +283,6 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
dependencies = [
"getrandom 0.1.16",
"libc",
"rand_chacha 0.2.2",
"rand_core 0.5.1",
"rand_hc",
"rand_pcg",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha 0.3.1",
"rand_core 0.6.3",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
dependencies = [
"ppv-lite86",
"rand_core 0.5.1",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core 0.6.3",
]
[[package]]
name = "rand_core"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
dependencies = [
"getrandom 0.1.16",
]
[[package]]
name = "rand_core"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7"
dependencies = [
"getrandom 0.2.6",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "rand_pcg"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "redox_syscall"
version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62f25bc4c7e55e0b0b7a1d43fb893f4fa1361d0abe38b9ce4f323c2adfe6ef42"
dependencies = [
"bitflags",
]
[[package]]
name = "regex"
version = "1.5.6"
@ -746,15 +300,6 @@ version = "0.6.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64"
[[package]]
name = "remove_dir_all"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
dependencies = [
"winapi",
]
[[package]]
name = "ring"
version = "0.16.20"
@ -770,15 +315,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "rustc_version"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366"
dependencies = [
"semver",
]
[[package]]
name = "rustls"
version = "0.20.6"
@ -809,22 +345,6 @@ version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695"
[[package]]
name = "sanitize_html"
version = "0.7.0"
dependencies = [
"html5ever",
"kuchiki",
"lazy_static",
"regex",
]
[[package]]
name = "scopeguard"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "sct"
version = "0.7.0"
@ -835,32 +355,6 @@ dependencies = [
"untrusted",
]
[[package]]
name = "selectors"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe"
dependencies = [
"bitflags",
"cssparser",
"derive_more",
"fxhash",
"log",
"matches",
"phf 0.8.0",
"phf_codegen 0.8.0",
"precomputed-hash",
"servo_arc",
"smallvec",
"thin-slice",
]
[[package]]
name = "semver"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a41d061efea015927ac527063765e73601444cdc344ba855bc7bd44578b25e1c"
[[package]]
name = "serde"
version = "1.0.137"
@ -881,16 +375,6 @@ dependencies = [
"syn",
]
[[package]]
name = "servo_arc"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432"
dependencies = [
"nodrop",
"stable_deref_trait",
]
[[package]]
name = "sha2"
version = "0.10.2"
@ -902,62 +386,18 @@ dependencies = [
"digest",
]
[[package]]
name = "siphasher"
version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
[[package]]
name = "smallvec"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83"
[[package]]
name = "spin"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "string_cache"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "213494b7a2b503146286049378ce02b482200519accc31872ee8be91fa820a08"
dependencies = [
"new_debug_unreachable",
"once_cell",
"parking_lot",
"phf_shared 0.10.0",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
"proc-macro2",
"quote",
]
[[package]]
name = "syn"
version = "1.0.96"
@ -969,37 +409,6 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "tempfile"
version = "3.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4"
dependencies = [
"cfg-if",
"fastrand",
"libc",
"redox_syscall",
"remove_dir_all",
"winapi",
]
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]]
name = "thin-slice"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
[[package]]
name = "time"
version = "0.1.44"
@ -1007,7 +416,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255"
dependencies = [
"libc",
"wasi 0.10.0+wasi-snapshot-preview1",
"wasi",
"winapi",
]
@ -1029,24 +438,12 @@ version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "version_check"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "wasi"
version = "0.9.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
[[package]]
name = "wasi"
version = "0.10.0+wasi-snapshot-preview1"
@ -1157,46 +554,3 @@ name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2"
dependencies = [
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_msvc"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47"
[[package]]
name = "windows_i686_gnu"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6"
[[package]]
name = "windows_i686_msvc"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024"
[[package]]
name = "windows_x86_64_gnu"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1"
[[package]]
name = "windows_x86_64_msvc"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680"


@ -2,6 +2,4 @@
members = [
"bin",
"sanitize-html-rs",
"kuchiki",
]


@ -11,5 +11,4 @@ base16ct = { version = "^0.1.0", features = [ "alloc" ] }
imap = { version = "^2.4.1", default-features = false }
mail-parser = "^0.5.0"
rustls-connector = { version = "^0.16.1", default-features = false, features = [ "webpki-roots-certs", "quic" ] }
sanitize_html = { path = "../sanitize-html-rs" }
sha2 = "^0.10.2"


@ -8,13 +8,11 @@ use std::{
use mail_parser::Message as MpMessage;
use sanitize_html::{rules::Element, sanitize_str};
use sha2::{Digest, Sha256};
extern crate imap;
extern crate mail_parser;
extern crate rustls_connector;
extern crate sanitize_html;
extern crate sha2;
use message_reader::{EmailReader, TestMessagesReader};
@ -50,9 +48,6 @@ fn main() {
println!("Processing message {}", msg.get_uid());
let parsed = msg.get_parsed().expect("A parsed messsage.");
let title = parsed.get_subject().expect("Expected a subject");
println!("{}", &title);
let html_body = parsed.get_html_body(0).expect("Could not read html body");
let processed_html = process_html(&html_body).expect("Could not process the HTML");
@ -92,27 +87,8 @@ fn get_path(parsed: &MpMessage, msg: &Message) -> String {
format!("{:05}_{}_{}.html", uid, date_str, &hash).to_owned()
}
fn process_html(input: &str) -> Result<String, sanitize_html::errors::SanitizeError> {
let mut rules = sanitize_html::rules::predefined::relaxed().delete("style");
rules
.allowed_elements
.get_mut("img")
.unwrap()
.attribute_rules
.rename("src", "data-source");
let mut span = Element::new("span");
span.attribute_rules
.modify("style", Box::new(|_i| "".to_string()));
let rules = rules.element(span);
//rules.allowed_elements.remove_entry("img");
sanitize_str(&rules, input)
//Ok(input.to_owned())
fn process_html(input: &str) -> Result<String, ()> {
Ok(input.replace("src", "data-source"))
}
fn write_to_test_path(msg: &Message) {

kuchiki/.gitignore vendored

@ -1,3 +0,0 @@
target
Cargo.lock
.cargo/config


@ -1,6 +0,0 @@
sudo: false
language: rust
rust:
- nightly
- beta
- stable


@ -1,22 +0,0 @@
[package]
name = "kuchiki"
version = "0.8.1"
authors = ["Simon Sapin <simon.sapin@exyr.org>"]
license = "MIT"
description = "(朽木) HTML/XML tree manipulation library"
repository = "https://github.com/kuchiki-rs/kuchiki"
edition = "2018"
[lib]
name = "kuchiki"
doctest = false
[dependencies]
cssparser = "^0.27"
matches = "^0.1.4"
html5ever = "^0.26"
selectors = "^0.22"
indexmap = "^1.6.0"
[dev-dependencies]
tempfile = "3"


@ -1,23 +0,0 @@
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.


@ -1,10 +0,0 @@
Kuchiki (朽木)
==============
HTML/XML¹ tree manipulation library for Rust.
[Documentation](https://docs.rs/kuchiki/)
See [users.rust-lang.org discussion](http://users.rust-lang.org/t/kuchiki-a-vaporware-html-xml-tree-manipulation-library/435).
¹ There is no support for XML syntax yet. The plan is to integrate with an existing parser.


@ -1,3 +0,0 @@
<meta http-equiv="refresh" content="0; url=https://docs.rs/kuchiki/">
<link rel="canonical" href="https://docs.rs/kuchiki/">
<a href="https://docs.rs/kuchiki/">Moved to docs.rs</a>


@ -1,3 +0,0 @@
<meta http-equiv="refresh" content="0; url=https://docs.rs/kuchiki/">
<link rel="canonical" href="https://docs.rs/kuchiki/">
<a href="https://docs.rs/kuchiki/">Moved to docs.rs</a>


@ -1,48 +0,0 @@
extern crate kuchiki;
use kuchiki::traits::*;
fn main() {
let html = r"
<DOCTYPE html>
<html>
<head></head>
<body>
<h1>Example</h1>
<p class='foo'>Hello, world!</p>
<p class='foo'>I love HTML</p>
</body>
</html>
";
let css_selector = ".foo";
let document = kuchiki::parse_html().one(html);
for css_match in document.select(css_selector).unwrap() {
// css_match is a NodeDataRef, but most of the interesting methods are
// on NodeRef. Let's get the underlying NodeRef.
let as_node = css_match.as_node();
// In this example, as_node represents an HTML node like
//
// <p class='foo'>Hello world!</p>"
//
// Which is distinct from just 'Hello world!'. To get rid of that <p>
// tag, we're going to get each element's first child, which will be
// a "text" node.
//
// There are other kinds of nodes, of course. The possibilities are all
// listed in the `NodeData` enum in this crate.
let text_node = as_node.first_child().unwrap();
// Let's get the actual text in this text node. A text node wraps around
// a RefCell<String>, so we need to call borrow() to get a &str out.
let text = text_node.as_text().unwrap().borrow();
// Prints:
//
// "Hello, world!"
// "I love HTML"
println!("{:?}", text);
}
}


@ -1,22 +0,0 @@
extern crate kuchiki;
fn main() {
let mut depth = 2;
// 20 M nodes is a few GB of memory.
while depth <= 20_000_000 {
let mut node = kuchiki::NodeRef::new_text("");
for _ in 0..depth {
let parent = kuchiki::NodeRef::new_text("");
parent.append(node);
node = parent;
}
println!("Trying to drop {} nodes...", depth);
// Without an explicit `impl Drop for Node`,
// depth = 20_000 causes "thread '<main>' has overflowed its stack"
// on my machine (Linux x86_64).
::std::mem::drop(node);
depth *= 10;
}
}


@ -1,83 +0,0 @@
use html5ever::{LocalName, Namespace, Prefix};
use indexmap::{map::Entry, IndexMap};
/// Convenience wrapper around a indexmap that adds method for attributes in the null namespace.
#[derive(Debug, PartialEq, Clone)]
pub struct Attributes {
/// A map of attributes whose name can have namespaces.
pub map: IndexMap<ExpandedName, Attribute>,
}
/// <https://www.w3.org/TR/REC-xml-names/#dt-expname>
#[derive(Debug, PartialEq, Eq, Hash, Clone, PartialOrd, Ord)]
pub struct ExpandedName {
/// Namespace URL
pub ns: Namespace,
/// "Local" part of the name
pub local: LocalName,
}
impl ExpandedName {
/// Trivial constructor
pub fn new<N: Into<Namespace>, L: Into<LocalName>>(ns: N, local: L) -> Self {
ExpandedName {
ns: ns.into(),
local: local.into(),
}
}
}
/// The non-identifying parts of an attribute
#[derive(Debug, PartialEq, Clone)]
pub struct Attribute {
/// The namespace prefix, if any
pub prefix: Option<Prefix>,
/// The attribute value
pub value: String,
}
impl Attributes {
/// Like IndexMap::contains
pub fn contains<A: Into<LocalName>>(&self, local_name: A) -> bool {
self.map.contains_key(&ExpandedName::new(ns!(), local_name))
}
/// Like IndexMap::get
pub fn get<A: Into<LocalName>>(&self, local_name: A) -> Option<&str> {
self.map
.get(&ExpandedName::new(ns!(), local_name))
.map(|attr| &*attr.value)
}
/// Like IndexMap::get_mut
pub fn get_mut<A: Into<LocalName>>(&mut self, local_name: A) -> Option<&mut String> {
self.map
.get_mut(&ExpandedName::new(ns!(), local_name))
.map(|attr| &mut attr.value)
}
/// Like IndexMap::entry
pub fn entry<A: Into<LocalName>>(&mut self, local_name: A) -> Entry<ExpandedName, Attribute> {
self.map.entry(ExpandedName::new(ns!(), local_name))
}
/// Like IndexMap::insert
pub fn insert<A: Into<LocalName>>(
&mut self,
local_name: A,
value: String,
) -> Option<Attribute> {
self.map.insert(
ExpandedName::new(ns!(), local_name),
Attribute {
prefix: None,
value,
},
)
}
/// Like IndexMap::remove
pub fn remove<A: Into<LocalName>>(&mut self, local_name: A) -> Option<Attribute> {
self.map.remove(&ExpandedName::new(ns!(), local_name))
}
}


@ -1,113 +0,0 @@
//! Specialized methods for `Cell` of some specific `!Copy` types,
//! allowing limited access to a value without moving it of the cell.
//!
//!
//! # Soundness
//!
//! These methods use and `Cell::as_ptr` and `unsafe`.
//! Their soundness lies in that:
//!
//! * `Cell<T>: !Sync` for any `T`, so no other thread is accessing this cell.
//! * For the duration of the raw pointer access,
//! this thread only runs code that is known to not access the same cell again.
//! In particular, no method of a type paramater is called.
//! For example, `clone_inner` would be unsound to generalize to any `Cell<T>`
//! because it would involve running arbitrary code through `T::clone`
//! and provide that code with a reference to the inside of the cell.
//!
//! ```rust
//! struct Evil(Box<u32>, Rc<Cell<Option<Evil>>>);
//! impl Clone for Evil {
//! fn clone(&self) -> Self {
//! mem::drop(self.1.take()); // Mess with the "other" node, which might be `self`.
//! Evil(
//! self.0.clone(), // possible use after free!
//! Rc::new(Cell::new(None))
//! )
//! }
//! }
//! let a = Rc::new(Cell::new(None));
//! a.set(Some(Evil(Box::new(5), a.clone()))); // Make a reference cycle.
//! a.clone_inner();
//! ```
//!
//! `Rc<T>::clone` and `Weak<T>::clone` do not have this problem
//! as they only increment reference counts and never call `T::clone`.
//!
//!
//! # Alternative
//!
//! To avoid using `unsafe` entirely, operating on a `T: !Copy` value inside a `Cell<T>`
//! would require temporarily replacing it with a default value:
//!
//! ```rust
//! fn option_dance<T, F, R>(cell: &Cell<T>, f: F) -> R
//! where T: Default, F: FnOnce(&mut T) -> R
//! {
//! let mut value = cell.take();
//! let result = f(&mut value);
//! cell.set(value);
//! result
//! }
//! ```
//!
//! It would be worth exploring whether LLVM can reliably optimize away these extra moves
//! and compile the `Option` dance to assembly similar to that of the `unsafe` operation.
use std::cell::Cell;
use std::rc::{Rc, Weak};
pub trait CellOption {
fn is_none(&self) -> bool;
}
impl<T> CellOption for Cell<Option<T>> {
#[inline]
fn is_none(&self) -> bool {
unsafe { (*self.as_ptr()).is_none() }
}
}
pub trait CellOptionWeak<T> {
fn upgrade(&self) -> Option<Rc<T>>;
fn clone_inner(&self) -> Option<Weak<T>>;
}
impl<T> CellOptionWeak<T> for Cell<Option<Weak<T>>> {
#[inline]
fn upgrade(&self) -> Option<Rc<T>> {
unsafe { (*self.as_ptr()).as_ref().and_then(Weak::upgrade) }
}
#[inline]
fn clone_inner(&self) -> Option<Weak<T>> {
unsafe { (*self.as_ptr()).clone() }
}
}
pub trait CellOptionRc<T> {
/// Return `Some` if this `Rc` is the only strong reference count,
/// even if there are weak references.
fn take_if_unique_strong(&self) -> Option<Rc<T>>;
fn clone_inner(&self) -> Option<Rc<T>>;
}
impl<T> CellOptionRc<T> for Cell<Option<Rc<T>>> {
#[inline]
fn take_if_unique_strong(&self) -> Option<Rc<T>> {
unsafe {
match *self.as_ptr() {
None => None,
Some(ref rc) if Rc::strong_count(rc) > 1 => None,
// Not borrowing the `Rc<T>` here
// as we would be invalidating that borrow while it is outstanding:
Some(_) => self.take(),
}
}
}
#[inline]
fn clone_inner(&self) -> Option<Rc<T>> {
unsafe { (*self.as_ptr()).clone() }
}
}


@ -1,452 +0,0 @@
//! Node iterators
use std::borrow::Borrow;
use std::cell::RefCell;
use std::iter::Rev;
use crate::node_data_ref::NodeDataRef;
use crate::select::Selectors;
use crate::tree::{ElementData, NodeRef};
impl NodeRef {
/// Return an iterator of references to this node and its ancestors.
#[inline]
pub fn inclusive_ancestors(&self) -> Ancestors {
Ancestors(Some(self.clone()))
}
/// Return an iterator of references to this nodes ancestors.
#[inline]
pub fn ancestors(&self) -> Ancestors {
Ancestors(self.parent())
}
/// Return an iterator of references to this node and the siblings before it.
#[inline]
pub fn inclusive_preceding_siblings(&self) -> Rev<Siblings> {
match self.parent() {
Some(parent) => {
let first_sibling = parent.first_child().unwrap();
debug_assert!(self.previous_sibling().is_some() || *self == first_sibling);
Siblings(Some(State {
next: first_sibling,
next_back: self.clone(),
}))
}
None => {
debug_assert!(self.previous_sibling().is_none());
Siblings(Some(State {
next: self.clone(),
next_back: self.clone(),
}))
}
}
.rev()
}
/// Return an iterator of references to this nodes siblings before it.
#[inline]
pub fn preceding_siblings(&self) -> Rev<Siblings> {
match (self.parent(), self.previous_sibling()) {
(Some(parent), Some(previous_sibling)) => {
let first_sibling = parent.first_child().unwrap();
Siblings(Some(State {
next: first_sibling,
next_back: previous_sibling,
}))
}
_ => Siblings(None),
}
.rev()
}
/// Return an iterator of references to this node and the siblings after it.
#[inline]
pub fn inclusive_following_siblings(&self) -> Siblings {
match self.parent() {
Some(parent) => {
let last_sibling = parent.last_child().unwrap();
debug_assert!(self.next_sibling().is_some() || *self == last_sibling);
Siblings(Some(State {
next: self.clone(),
next_back: last_sibling,
}))
}
None => {
debug_assert!(self.next_sibling().is_none());
Siblings(Some(State {
next: self.clone(),
next_back: self.clone(),
}))
}
}
}
/// Return an iterator of references to this nodes siblings after it.
#[inline]
pub fn following_siblings(&self) -> Siblings {
match (self.parent(), self.next_sibling()) {
(Some(parent), Some(next_sibling)) => {
let last_sibling = parent.last_child().unwrap();
Siblings(Some(State {
next: next_sibling,
next_back: last_sibling,
}))
}
_ => Siblings(None),
}
}
/// Return an iterator of references to this nodes children.
#[inline]
pub fn children(&self) -> Siblings {
match (self.first_child(), self.last_child()) {
(Some(first_child), Some(last_child)) => Siblings(Some(State {
next: first_child,
next_back: last_child,
})),
(None, None) => Siblings(None),
_ => unreachable!(),
}
}
/// Return an iterator of references to this node and its descendants, in tree order.
///
/// Parent nodes appear before the descendants.
///
/// Note: this is the `NodeEdge::Start` items from `traverse()`.
#[inline]
pub fn inclusive_descendants(&self) -> Descendants {
Descendants(self.traverse_inclusive())
}
/// Return an iterator of references to this nodes descendants, in tree order.
///
/// Parent nodes appear before the descendants.
///
/// Note: this is the `NodeEdge::Start` items from `traverse()`.
#[inline]
pub fn descendants(&self) -> Descendants {
Descendants(self.traverse())
}
/// Return an iterator of the start and end edges of this node and its descendants,
/// in tree order.
#[inline]
pub fn traverse_inclusive(&self) -> Traverse {
Traverse(Some(State {
next: NodeEdge::Start(self.clone()),
next_back: NodeEdge::End(self.clone()),
}))
}
/// Return an iterator of the start and end edges of this nodes descendants,
/// in tree order.
#[inline]
pub fn traverse(&self) -> Traverse {
match (self.first_child(), self.last_child()) {
(Some(first_child), Some(last_child)) => Traverse(Some(State {
next: NodeEdge::Start(first_child),
next_back: NodeEdge::End(last_child),
})),
(None, None) => Traverse(None),
_ => unreachable!(),
}
}
/// Return an iterator of the inclusive descendants element that match the given selector list.
#[inline]
pub fn select(&self, selectors: &str) -> Result<Select<Elements<Descendants>>, ()> {
self.inclusive_descendants().select(selectors)
}
/// Return the first inclusive descendants element that match the given selector list.
#[inline]
pub fn select_first(&self, selectors: &str) -> Result<NodeDataRef<ElementData>, ()> {
let mut elements = self.select(selectors)?;
elements.next().ok_or(())
}
}
#[derive(Debug, Clone)]
struct State<T> {
next: T,
next_back: T,
}
/// A double-ended iterator of sibling nodes.
#[derive(Debug, Clone)]
pub struct Siblings(Option<State<NodeRef>>);
macro_rules! siblings_next {
($next: ident, $next_back: ident, $next_sibling: ident) => {
fn $next(&mut self) -> Option<NodeRef> {
#![allow(non_shorthand_field_patterns)]
self.0.take().map(|State { $next: next, $next_back: next_back }| {
if let Some(sibling) = next.$next_sibling() {
if next != next_back {
self.0 = Some(State { $next: sibling, $next_back: next_back })
}
}
next
})
}
}
}
impl Iterator for Siblings {
type Item = NodeRef;
siblings_next!(next, next_back, next_sibling);
}
impl DoubleEndedIterator for Siblings {
siblings_next!(next_back, next, previous_sibling);
}
/// An iterator on ancestor nodes.
#[derive(Debug, Clone)]
pub struct Ancestors(Option<NodeRef>);
impl Iterator for Ancestors {
type Item = NodeRef;
#[inline]
fn next(&mut self) -> Option<NodeRef> {
self.0.take().map(|node| {
self.0 = node.parent();
node
})
}
}
/// An iterator of references to a given node and its descendants, in tree order.
#[derive(Debug, Clone)]
pub struct Descendants(Traverse);
macro_rules! descendants_next {
($next: ident) => {
#[inline]
fn $next(&mut self) -> Option<NodeRef> {
loop {
match (self.0).$next() {
Some(NodeEdge::Start(node)) => return Some(node),
Some(NodeEdge::End(_)) => {}
None => return None
}
}
}
}
}
impl Iterator for Descendants {
type Item = NodeRef;
descendants_next!(next);
}
impl DoubleEndedIterator for Descendants {
descendants_next!(next_back);
}
/// Marks either the start or the end of a node.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum NodeEdge<T> {
/// Indicates that start of a node that has children.
/// Yielded by `Traverse::next` before the nodes descendants.
/// In HTML or XML, this corresponds to an opening tag like `<div>`
Start(T),
/// Indicates that end of a node that has children.
/// Yielded by `Traverse::next` after the nodes descendants.
/// In HTML or XML, this corresponds to a closing tag like `</div>`
End(T),
}
/// An iterator of the start and end edges of the nodes in a given subtree.
#[derive(Debug, Clone)]
pub struct Traverse(Option<State<NodeEdge<NodeRef>>>);
macro_rules! traverse_next {
($next: ident, $next_back: ident, $first_child: ident, $next_sibling: ident, $Start: ident, $End: ident) => {
fn $next(&mut self) -> Option<NodeEdge<NodeRef>> {
#![allow(non_shorthand_field_patterns)]
self.0.take().map(|State { $next: next, $next_back: next_back }| {
if next != next_back {
self.0 = match next {
NodeEdge::$Start(ref node) => {
match node.$first_child() {
Some(child) => {
Some(State { $next: NodeEdge::$Start(child), $next_back: next_back })
}
None => Some(State { $next: NodeEdge::$End(node.clone()), $next_back: next_back })
}
}
NodeEdge::$End(ref node) => {
match node.$next_sibling() {
Some(sibling) => {
Some(State { $next: NodeEdge::$Start(sibling), $next_back: next_back })
}
None => node.parent().map(|parent| {
State { $next: NodeEdge::$End(parent), $next_back: next_back }
})
}
}
};
}
next
})
}
}
}
impl Iterator for Traverse {
type Item = NodeEdge<NodeRef>;
traverse_next!(next, next_back, first_child, next_sibling, Start, End);
}
impl DoubleEndedIterator for Traverse {
traverse_next!(next_back, next, last_child, previous_sibling, End, Start);
}
macro_rules! filter_map_like_iterator {
(#[$doc: meta] $name: ident: $f: expr, $from: ty => $to: ty) => {
#[$doc]
#[derive(Debug, Clone)]
pub struct $name<I>(pub I);
impl<I> Iterator for $name<I>
where
I: Iterator<Item = $from>,
{
type Item = $to;
#[inline]
fn next(&mut self) -> Option<$to> {
for x in self.0.by_ref() {
if let Some(y) = ($f)(x) {
return Some(y);
}
}
None
}
}
impl<I> DoubleEndedIterator for $name<I>
where
I: DoubleEndedIterator<Item = $from>,
{
#[inline]
fn next_back(&mut self) -> Option<$to> {
for x in self.0.by_ref().rev() {
if let Some(y) = ($f)(x) {
return Some(y);
}
}
None
}
}
};
}
filter_map_like_iterator! {
/// A node iterator adaptor that yields element nodes.
Elements: NodeRef::into_element_ref, NodeRef => NodeDataRef<ElementData>
}
filter_map_like_iterator! {
/// A node iterator adaptor that yields comment nodes.
Comments: NodeRef::into_comment_ref, NodeRef => NodeDataRef<RefCell<String>>
}
filter_map_like_iterator! {
/// A node iterator adaptor that yields text nodes.
TextNodes: NodeRef::into_text_ref, NodeRef => NodeDataRef<RefCell<String>>
}
/// An element iterator adaptor that yields elements maching given selectors.
pub struct Select<I, S = Selectors>
where
I: Iterator<Item = NodeDataRef<ElementData>>,
S: Borrow<Selectors>,
{
/// The underlying iterator.
pub iter: I,
/// The selectors to be matched.
pub selectors: S,
}
impl<I, S> Iterator for Select<I, S>
where
I: Iterator<Item = NodeDataRef<ElementData>>,
S: Borrow<Selectors>,
{
type Item = NodeDataRef<ElementData>;
#[inline]
fn next(&mut self) -> Option<NodeDataRef<ElementData>> {
for element in self.iter.by_ref() {
if self.selectors.borrow().matches(&element) {
return Some(element);
}
}
None
}
}
impl<I, S> DoubleEndedIterator for Select<I, S>
where
I: DoubleEndedIterator<Item = NodeDataRef<ElementData>>,
S: Borrow<Selectors>,
{
#[inline]
fn next_back(&mut self) -> Option<NodeDataRef<ElementData>> {
for element in self.iter.by_ref().rev() {
if self.selectors.borrow().matches(&element) {
return Some(element);
}
}
None
}
}
/// Convenience methods for node iterators.
pub trait NodeIterator: Sized + Iterator<Item = NodeRef> {
/// Filter this element iterator to elements.
#[inline]
fn elements(self) -> Elements<Self> {
Elements(self)
}
/// Filter this node iterator to text nodes.
#[inline]
fn text_nodes(self) -> TextNodes<Self> {
TextNodes(self)
}
/// Filter this node iterator to comment nodes.
#[inline]
fn comments(self) -> Comments<Self> {
Comments(self)
}
/// Filter this node iterator to elements maching the given selectors.
#[inline]
fn select(self, selectors: &str) -> Result<Select<Elements<Self>>, ()> {
self.elements().select(selectors)
}
}
/// Convenience methods for element iterators.
pub trait ElementIterator: Sized + Iterator<Item = NodeDataRef<ElementData>> {
/// Filter this element iterator to elements maching the given selectors.
#[inline]
fn select(self, selectors: &str) -> Result<Select<Self>, ()> {
Selectors::compile(selectors).map(|s| Select {
iter: self,
selectors: s,
})
}
}
impl<I> NodeIterator for I where I: Iterator<Item = NodeRef> {}
impl<I> ElementIterator for I where I: Iterator<Item = NodeDataRef<ElementData>> {}


@ -1,40 +0,0 @@
/*!
Kuchiki (朽木), a HTML/XML tree manipulation library for Rust.
*/
#![deny(missing_docs)]
#[macro_use]
extern crate html5ever;
#[macro_use]
extern crate matches;
mod attributes;
mod cell_extras;
pub mod iter;
mod node_data_ref;
mod parser;
mod select;
mod serializer;
#[cfg(test)]
mod tests;
mod tree;
pub use attributes::{Attribute, Attributes, ExpandedName};
pub use node_data_ref::NodeDataRef;
pub use parser::{parse_html, parse_html_with_options, parse_fragment, ParseOpts, Sink};
pub use select::{Selector, Selectors, Specificity};
pub use tree::{Doctype, DocumentData, ElementData, Node, NodeData, NodeRef};
/// This module re-exports a number of traits that are useful when using Kuchiki.
/// It can be used with:
///
/// ```rust
/// use kuchiki::traits::*;
/// ```
pub mod traits {
pub use html5ever::tendril::TendrilSink;
pub use crate::iter::{ElementIterator, NodeIterator};
}


@ -1,116 +0,0 @@
use std::cell::RefCell;
use std::fmt;
use std::ops::Deref;
use crate::tree::{Doctype, DocumentData, ElementData, Node, NodeRef};
impl NodeRef {
/// If this node is an element, return a strong reference to element-specific data.
#[inline]
pub fn into_element_ref(self) -> Option<NodeDataRef<ElementData>> {
NodeDataRef::new_opt(self, Node::as_element)
}
/// If this node is a text node, return a strong reference to its contents.
#[inline]
pub fn into_text_ref(self) -> Option<NodeDataRef<RefCell<String>>> {
NodeDataRef::new_opt(self, Node::as_text)
}
/// If this node is a comment, return a strong reference to its contents.
#[inline]
pub fn into_comment_ref(self) -> Option<NodeDataRef<RefCell<String>>> {
NodeDataRef::new_opt(self, Node::as_comment)
}
/// If this node is a doctype, return a strong reference to doctype-specific data.
#[inline]
pub fn into_doctype_ref(self) -> Option<NodeDataRef<Doctype>> {
NodeDataRef::new_opt(self, Node::as_doctype)
}
/// If this node is a document, return a strong reference to document-specific data.
#[inline]
pub fn into_document_ref(self) -> Option<NodeDataRef<DocumentData>> {
NodeDataRef::new_opt(self, Node::as_document)
}
}
/// Holds a strong reference to a node, but dereferences to some component inside of it.
#[derive(Eq)]
pub struct NodeDataRef<T> {
_keep_alive: NodeRef,
_reference: *const T,
}
impl<T> NodeDataRef<T> {
/// Create a `NodeDataRef` for a component in a given node.
#[inline]
pub fn new<F>(rc: NodeRef, f: F) -> NodeDataRef<T>
where
F: FnOnce(&Node) -> &T,
{
NodeDataRef {
_reference: f(&*rc),
_keep_alive: rc,
}
}
/// Create a `NodeDataRef` for and a component that may or may not be in a given node.
#[inline]
pub fn new_opt<F>(rc: NodeRef, f: F) -> Option<NodeDataRef<T>>
where
F: FnOnce(&Node) -> Option<&T>,
{
f(&*rc).map(|r| r as *const T).map(move |r| NodeDataRef {
_reference: r,
_keep_alive: rc,
})
}
/// Access the corresponding node.
#[inline]
pub fn as_node(&self) -> &NodeRef {
&self._keep_alive
}
}
impl<T> Deref for NodeDataRef<T> {
type Target = T;
#[inline]
fn deref(&self) -> &T {
unsafe { &*self._reference }
}
}
// #[derive(PartialEq)] would compare both fields
impl<T> PartialEq for NodeDataRef<T> {
#[inline]
fn eq(&self, other: &Self) -> bool {
self._keep_alive == other._keep_alive
}
}
// #[derive(Clone)] would have an unnecessary `T: Clone` bound
impl<T> Clone for NodeDataRef<T> {
#[inline]
fn clone(&self) -> Self {
NodeDataRef {
_keep_alive: self._keep_alive.clone(),
_reference: self._reference,
}
}
}
impl<T: fmt::Debug> fmt::Debug for NodeDataRef<T> {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
fmt::Debug::fmt(&**self, f)
}
}
impl NodeDataRef<ElementData> {
/// Return the concatenation of all text nodes in this subtree.
pub fn text_contents(&self) -> String {
self.as_node().text_contents()
}
}


@ -1,241 +0,0 @@
use html5ever::tendril::StrTendril;
use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
use html5ever::{self, Attribute, ExpandedName, QualName};
use std::borrow::Cow;
use crate::attributes;
use crate::tree::NodeRef;
/// Options for the HTML parser.
#[derive(Default)]
pub struct ParseOpts {
/// Options for the HTML tokenizer.
pub tokenizer: html5ever::tokenizer::TokenizerOpts,
/// Options for the HTML tree builder.
pub tree_builder: html5ever::tree_builder::TreeBuilderOpts,
/// A callback for HTML parse errors (which are never fatal).
pub on_parse_error: Option<Box<dyn FnMut(Cow<'static, str>)>>,
}
/// Parse an HTML document with html5ever and the default configuration.
pub fn parse_html() -> html5ever::Parser<Sink> {
parse_html_with_options(ParseOpts::default())
}
/// Parse an HTML document with html5ever with custom configuration.
pub fn parse_html_with_options(opts: ParseOpts) -> html5ever::Parser<Sink> {
let sink = Sink {
document_node: NodeRef::new_document(),
on_parse_error: opts.on_parse_error,
};
let html5opts = html5ever::ParseOpts {
tokenizer: opts.tokenizer,
tree_builder: opts.tree_builder,
};
html5ever::parse_document(sink, html5opts)
}
/// Parse an HTML fragment with html5ever and the default configuration.
pub fn parse_fragment(ctx_name: QualName, ctx_attr: Vec<Attribute>) -> html5ever::Parser<Sink> {
parse_fragment_with_options(ParseOpts::default(), ctx_name, ctx_attr)
}
/// Parse an HTML fragment with html5ever with custom configuration.
pub fn parse_fragment_with_options(opts: ParseOpts, ctx_name: QualName, ctx_attr: Vec<Attribute>) -> html5ever::Parser<Sink> {
let sink = Sink {
document_node: NodeRef::new_document(),
on_parse_error: opts.on_parse_error,
};
let html5opts = html5ever::ParseOpts {
tokenizer: opts.tokenizer,
tree_builder: opts.tree_builder,
};
html5ever::parse_fragment(sink, html5opts, ctx_name, ctx_attr)
}
/// Receives new tree nodes during parsing.
pub struct Sink {
document_node: NodeRef,
on_parse_error: Option<Box<dyn FnMut(Cow<'static, str>)>>,
}
impl TreeSink for Sink {
type Output = NodeRef;
fn finish(self) -> NodeRef {
self.document_node
}
type Handle = NodeRef;
#[inline]
fn parse_error(&mut self, message: Cow<'static, str>) {
if let Some(ref mut handler) = self.on_parse_error {
handler(message)
}
}
#[inline]
fn get_document(&mut self) -> NodeRef {
self.document_node.clone()
}
#[inline]
fn set_quirks_mode(&mut self, mode: QuirksMode) {
self.document_node
.as_document()
.unwrap()
._quirks_mode
.set(mode)
}
#[inline]
fn same_node(&self, x: &NodeRef, y: &NodeRef) -> bool {
x == y
}
#[inline]
fn elem_name<'a>(&self, target: &'a NodeRef) -> ExpandedName<'a> {
target.as_element().unwrap().name.expanded()
}
#[inline]
fn create_element(
&mut self,
name: QualName,
attrs: Vec<Attribute>,
_flags: ElementFlags,
) -> NodeRef {
NodeRef::new_element(
name,
attrs.into_iter().map(|attr| {
let Attribute {
name: QualName { prefix, ns, local },
value,
} = attr;
let value = String::from(value);
(
attributes::ExpandedName { ns, local },
attributes::Attribute { prefix, value },
)
}),
)
}
#[inline]
fn create_comment(&mut self, text: StrTendril) -> NodeRef {
NodeRef::new_comment(text)
}
#[inline]
fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> NodeRef {
NodeRef::new_processing_instruction(target, data)
}
#[inline]
fn append(&mut self, parent: &NodeRef, child: NodeOrText<NodeRef>) {
match child {
NodeOrText::AppendNode(node) => parent.append(node),
NodeOrText::AppendText(text) => {
if let Some(last_child) = parent.last_child() {
if let Some(existing) = last_child.as_text() {
existing.borrow_mut().push_str(&text);
return;
}
}
parent.append(NodeRef::new_text(text))
}
}
}
#[inline]
fn append_before_sibling(&mut self, sibling: &NodeRef, child: NodeOrText<NodeRef>) {
match child {
NodeOrText::AppendNode(node) => sibling.insert_before(node),
NodeOrText::AppendText(text) => {
if let Some(previous_sibling) = sibling.previous_sibling() {
if let Some(existing) = previous_sibling.as_text() {
existing.borrow_mut().push_str(&text);
return;
}
}
sibling.insert_before(NodeRef::new_text(text))
}
}
}
#[inline]
fn append_doctype_to_document(
&mut self,
name: StrTendril,
public_id: StrTendril,
system_id: StrTendril,
) {
self.document_node
.append(NodeRef::new_doctype(name, public_id, system_id))
}
#[inline]
fn add_attrs_if_missing(&mut self, target: &NodeRef, attrs: Vec<Attribute>) {
let element = target.as_element().unwrap();
let mut attributes = element.attributes.borrow_mut();
for Attribute {
name: QualName { prefix, ns, local },
value,
} in attrs
{
attributes
.map
.entry(attributes::ExpandedName { ns, local })
.or_insert_with(|| {
let value = String::from(value);
attributes::Attribute { prefix, value }
});
}
}
#[inline]
fn remove_from_parent(&mut self, target: &NodeRef) {
target.detach()
}
#[inline]
fn reparent_children(&mut self, node: &NodeRef, new_parent: &NodeRef) {
// FIXME: Can this be done more effciently in rctree,
// by moving the whole linked list of children at once?
for child in node.children() {
new_parent.append(child)
}
}
#[inline]
fn mark_script_already_started(&mut self, _node: &NodeRef) {
// FIXME: Is this useful outside of a browser?
}
#[inline]
fn get_template_contents(&mut self, target: &NodeRef) -> NodeRef {
target
.as_element()
.unwrap()
.template_contents
.clone()
.unwrap()
}
fn append_based_on_parent_node(
&mut self,
element: &NodeRef,
prev_element: &NodeRef,
child: NodeOrText<NodeRef>,
) {
if element.parent().is_some() {
self.append_before_sibling(element, child)
} else {
self.append(prev_element, child)
}
}
}


@ -1,433 +0,0 @@
use crate::attributes::ExpandedName;
use cssparser::{self, CowRcStr, ParseError, SourceLocation, ToCss};
use html5ever::{LocalName, Namespace};
use crate::iter::{NodeIterator, Select};
use crate::node_data_ref::NodeDataRef;
use selectors::attr::{AttrSelectorOperation, CaseSensitivity, NamespaceConstraint};
use selectors::context::QuirksMode;
use selectors::parser::SelectorParseErrorKind;
use selectors::parser::{
NonTSPseudoClass, Parser, Selector as GenericSelector, SelectorImpl, SelectorList,
};
use selectors::{self, matching, OpaqueElement};
use std::fmt;
use crate::tree::{ElementData, Node, NodeData, NodeRef};
/// The definition of whitespace per CSS Selectors Level 3 § 4.
///
/// Copied from rust-selectors.
static SELECTOR_WHITESPACE: &[char] = &[' ', '\t', '\n', '\r', '\x0C'];
#[derive(Debug, Clone)]
pub struct KuchikiSelectors;
impl SelectorImpl for KuchikiSelectors {
type AttrValue = String;
type Identifier = LocalName;
type ClassName = LocalName;
type LocalName = LocalName;
type PartName = LocalName;
type NamespacePrefix = LocalName;
type NamespaceUrl = Namespace;
type BorrowedNamespaceUrl = Namespace;
type BorrowedLocalName = LocalName;
type NonTSPseudoClass = PseudoClass;
type PseudoElement = PseudoElement;
type ExtraMatchingData = ();
}
struct KuchikiParser;
impl<'i> Parser<'i> for KuchikiParser {
type Impl = KuchikiSelectors;
type Error = SelectorParseErrorKind<'i>;
fn parse_non_ts_pseudo_class(
&self,
location: SourceLocation,
name: CowRcStr<'i>,
) -> Result<PseudoClass, ParseError<'i, SelectorParseErrorKind<'i>>> {
use self::PseudoClass::*;
if name.eq_ignore_ascii_case("any-link") {
Ok(AnyLink)
} else if name.eq_ignore_ascii_case("link") {
Ok(Link)
} else if name.eq_ignore_ascii_case("visited") {
Ok(Visited)
} else if name.eq_ignore_ascii_case("active") {
Ok(Active)
} else if name.eq_ignore_ascii_case("focus") {
Ok(Focus)
} else if name.eq_ignore_ascii_case("hover") {
Ok(Hover)
} else if name.eq_ignore_ascii_case("enabled") {
Ok(Enabled)
} else if name.eq_ignore_ascii_case("disabled") {
Ok(Disabled)
} else if name.eq_ignore_ascii_case("checked") {
Ok(Checked)
} else if name.eq_ignore_ascii_case("indeterminate") {
Ok(Indeterminate)
} else {
Err(
location.new_custom_error(SelectorParseErrorKind::UnsupportedPseudoClassOrElement(
name,
)),
)
}
}
}
#[derive(PartialEq, Eq, Clone, Debug, Hash)]
pub enum PseudoClass {
AnyLink,
Link,
Visited,
Active,
Focus,
Hover,
Enabled,
Disabled,
Checked,
Indeterminate,
}
impl NonTSPseudoClass for PseudoClass {
type Impl = KuchikiSelectors;
fn is_active_or_hover(&self) -> bool {
matches!(*self, PseudoClass::Active | PseudoClass::Hover)
}
fn is_user_action_state(&self) -> bool {
matches!(*self, PseudoClass::Active | PseudoClass::Hover | PseudoClass::Focus)
}
fn has_zero_specificity(&self) -> bool {
false
}
}
impl ToCss for PseudoClass {
fn to_css<W>(&self, dest: &mut W) -> fmt::Result
where
W: fmt::Write,
{
dest.write_str(match *self {
PseudoClass::AnyLink => ":any-link",
PseudoClass::Link => ":link",
PseudoClass::Visited => ":visited",
PseudoClass::Active => ":active",
PseudoClass::Focus => ":focus",
PseudoClass::Hover => ":hover",
PseudoClass::Enabled => ":enabled",
PseudoClass::Disabled => ":disabled",
PseudoClass::Checked => ":checked",
PseudoClass::Indeterminate => ":indeterminate",
})
}
}
#[derive(PartialEq, Eq, Clone, Debug, Hash)]
pub enum PseudoElement {}
impl ToCss for PseudoElement {
fn to_css<W>(&self, _dest: &mut W) -> fmt::Result
where
W: fmt::Write,
{
match *self {}
}
}
impl selectors::parser::PseudoElement for PseudoElement {
type Impl = KuchikiSelectors;
}
impl selectors::Element for NodeDataRef<ElementData> {
type Impl = KuchikiSelectors;
#[inline]
fn opaque(&self) -> OpaqueElement {
let node: &Node = self.as_node();
OpaqueElement::new(node)
}
#[inline]
fn is_html_slot_element(&self) -> bool {
false
}
#[inline]
fn parent_node_is_shadow_root(&self) -> bool {
false
}
#[inline]
fn containing_shadow_host(&self) -> Option<Self> {
None
}
#[inline]
fn parent_element(&self) -> Option<Self> {
self.as_node().parent().and_then(NodeRef::into_element_ref)
}
#[inline]
fn prev_sibling_element(&self) -> Option<Self> {
self.as_node().preceding_siblings().elements().next()
}
#[inline]
fn next_sibling_element(&self) -> Option<Self> {
self.as_node().following_siblings().elements().next()
}
#[inline]
fn is_empty(&self) -> bool {
self.as_node().children().all(|child| match *child.data() {
NodeData::Element(_) => false,
NodeData::Text(ref text) => text.borrow().is_empty(),
_ => true,
})
}
#[inline]
fn is_root(&self) -> bool {
match self.as_node().parent() {
None => false,
Some(parent) => matches!(*parent.data(), NodeData::Document(_)),
}
}
#[inline]
fn is_html_element_in_html_document(&self) -> bool {
        // FIXME: Have a notion of HTML document vs. XML document?
self.name.ns == ns!(html)
}
#[inline]
fn has_local_name(&self, name: &LocalName) -> bool {
self.name.local == *name
}
#[inline]
fn has_namespace(&self, namespace: &Namespace) -> bool {
self.name.ns == *namespace
}
#[inline]
fn is_part(&self, _name: &LocalName) -> bool {
false
}
#[inline]
fn exported_part(&self, _: &LocalName) -> Option<LocalName> {
None
}
#[inline]
fn imported_part(&self, _: &LocalName) -> Option<LocalName> {
None
}
#[inline]
fn is_pseudo_element(&self) -> bool {
false
}
#[inline]
fn is_same_type(&self, other: &Self) -> bool {
self.name == other.name
}
#[inline]
fn is_link(&self) -> bool {
self.name.ns == ns!(html)
&& matches!(
self.name.local,
local_name!("a") | local_name!("area") | local_name!("link")
)
&& self
.attributes
.borrow()
.map
.contains_key(&ExpandedName::new(ns!(), local_name!("href")))
}
#[inline]
fn has_id(&self, id: &LocalName, case_sensitivity: CaseSensitivity) -> bool {
self.attributes
.borrow()
.get(local_name!("id"))
.map_or(false, |id_attr| {
case_sensitivity.eq(id.as_bytes(), id_attr.as_bytes())
})
}
#[inline]
fn has_class(&self, name: &LocalName, case_sensitivity: CaseSensitivity) -> bool {
let name = name.as_bytes();
!name.is_empty()
&& if let Some(class_attr) = self.attributes.borrow().get(local_name!("class")) {
class_attr
.split(SELECTOR_WHITESPACE)
.any(|class| case_sensitivity.eq(class.as_bytes(), name))
} else {
false
}
}
#[inline]
fn attr_matches(
&self,
ns: &NamespaceConstraint<&Namespace>,
local_name: &LocalName,
operation: &AttrSelectorOperation<&String>,
) -> bool {
let attrs = self.attributes.borrow();
match *ns {
NamespaceConstraint::Any => attrs
.map
.iter()
.any(|(name, attr)| name.local == *local_name && operation.eval_str(&attr.value)),
NamespaceConstraint::Specific(ns_url) => attrs
.map
.get(&ExpandedName::new(ns_url, local_name.clone()))
.map_or(false, |attr| operation.eval_str(&attr.value)),
}
}
fn match_pseudo_element(
&self,
pseudo: &PseudoElement,
_context: &mut matching::MatchingContext<KuchikiSelectors>,
) -> bool {
match *pseudo {}
}
fn match_non_ts_pseudo_class<F>(
&self,
pseudo: &PseudoClass,
_context: &mut matching::MatchingContext<KuchikiSelectors>,
_flags_setter: &mut F,
) -> bool
where
F: FnMut(&Self, matching::ElementSelectorFlags),
{
use self::PseudoClass::*;
match *pseudo {
Active | Focus | Hover | Enabled | Disabled | Checked | Indeterminate | Visited => {
false
}
AnyLink | Link => {
self.name.ns == ns!(html)
&& matches!(
self.name.local,
local_name!("a") | local_name!("area") | local_name!("link")
)
&& self.attributes.borrow().contains(local_name!("href"))
}
}
}
}
/// A pre-compiled list of CSS Selectors.
pub struct Selectors(pub Vec<Selector>);
/// A pre-compiled CSS Selector.
pub struct Selector(GenericSelector<KuchikiSelectors>);
/// The specificity of a selector.
///
/// Opaque, but ordered.
///
/// Determines precedence in the cascading algorithm.
/// When equal, a rule later in source order takes precedence.
#[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd)]
pub struct Specificity(u32);
impl Selectors {
/// Compile a list of selectors. This may fail on syntax errors or unsupported selectors.
#[inline]
pub fn compile(s: &str) -> Result<Selectors, ()> {
let mut input = cssparser::ParserInput::new(s);
match SelectorList::parse(&KuchikiParser, &mut cssparser::Parser::new(&mut input)) {
Ok(list) => Ok(Selectors(list.0.into_iter().map(Selector).collect())),
Err(_) => Err(()),
}
}
/// Returns whether the given element matches this list of selectors.
#[inline]
pub fn matches(&self, element: &NodeDataRef<ElementData>) -> bool {
self.0.iter().any(|s| s.matches(element))
}
/// Filter an element iterator, yielding those matching this list of selectors.
#[inline]
pub fn filter<I>(&self, iter: I) -> Select<I, &Selectors>
where
I: Iterator<Item = NodeDataRef<ElementData>>,
{
Select {
iter,
selectors: self,
}
}
}
impl Selector {
/// Returns whether the given element matches this selector.
#[inline]
pub fn matches(&self, element: &NodeDataRef<ElementData>) -> bool {
let mut context = matching::MatchingContext::new(
matching::MatchingMode::Normal,
None,
None,
QuirksMode::NoQuirks,
);
matching::matches_selector(&self.0, 0, None, element, &mut context, &mut |_, _| {})
}
/// Return the specificity of this selector.
pub fn specificity(&self) -> Specificity {
Specificity(self.0.specificity())
}
}
impl ::std::str::FromStr for Selectors {
type Err = ();
#[inline]
fn from_str(s: &str) -> Result<Selectors, ()> {
Selectors::compile(s)
}
}
impl fmt::Display for Selector {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.0.to_css(f)
}
}
impl fmt::Display for Selectors {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut iter = self.0.iter();
let first = iter
.next()
.expect("Empty Selectors, should contain at least one selector");
first.0.to_css(f)?;
for selector in iter {
f.write_str(", ")?;
selector.0.to_css(f)?;
}
Ok(())
}
}
impl fmt::Debug for Selector {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fmt::Display::fmt(self, f)
}
}
impl fmt::Debug for Selectors {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fmt::Display::fmt(self, f)
}
}
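// A minimal usage sketch, assuming kuchiki's `parse_html` front-end and the
// `traits` prelude that the test-suite later in this commit also relies on:
// compile a selector list once, then reuse it to filter or match elements.
#[cfg(test)]
mod selectors_usage_sketch {
    use super::Selectors;
    use crate::parser::parse_html;
    use crate::traits::*;

    #[test]
    fn compile_filter_and_match() {
        let document = parse_html().one("<p class=foo>Foo</p><p>Bar</p>");
        let selectors = Selectors::compile("p.foo").expect("valid selector list");

        // `filter` wraps any element iterator; only matching elements remain.
        let matching: Vec<_> = selectors
            .filter(document.descendants().elements())
            .collect();
        assert_eq!(matching.len(), 1);

        // `matches` checks one element; `FromStr` is shorthand for `compile`.
        assert!(selectors.matches(&matching[0]));
        let parsed: Selectors = "p.foo".parse().expect("valid selector list");
        assert!(parsed.matches(&matching[0]));
    }
}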

View File

@ -1,105 +0,0 @@
use html5ever::serialize::TraversalScope::*;
use html5ever::serialize::{serialize, Serialize, SerializeOpts, Serializer, TraversalScope};
use html5ever::QualName;
use std::fs::File;
use std::io::{Result, Write};
use std::path::Path;
use std::string::ToString;
use crate::tree::{NodeData, NodeRef};
impl Serialize for NodeRef {
fn serialize<S: Serializer>(
&self,
serializer: &mut S,
traversal_scope: TraversalScope,
) -> Result<()> {
match (traversal_scope, self.data()) {
(ref scope, &NodeData::Element(ref element)) => {
if *scope == IncludeNode {
let attrs = element.attributes.borrow();
// Unfortunately we need to allocate something to hold these &'a QualName
let attrs = attrs
.map
.iter()
.map(|(name, attr)| {
(
QualName::new(
attr.prefix.clone(),
name.ns.clone(),
name.local.clone(),
),
&attr.value,
)
})
.collect::<Vec<_>>();
serializer.start_elem(
element.name.clone(),
attrs.iter().map(|&(ref name, value)| (name, &**value)),
)?
}
for child in self.children() {
Serialize::serialize(&child, serializer, IncludeNode)?
}
if *scope == IncludeNode {
serializer.end_elem(element.name.clone())?
}
Ok(())
}
(_, &NodeData::DocumentFragment) | (_, &NodeData::Document(_)) => {
for child in self.children() {
Serialize::serialize(&child, serializer, IncludeNode)?
}
Ok(())
}
(ChildrenOnly(_), _) => Ok(()),
(IncludeNode, &NodeData::Doctype(ref doctype)) => {
serializer.write_doctype(&doctype.name)
}
(IncludeNode, &NodeData::Text(ref text)) => serializer.write_text(&text.borrow()),
(IncludeNode, &NodeData::Comment(ref text)) => serializer.write_comment(&text.borrow()),
(IncludeNode, &NodeData::ProcessingInstruction(ref contents)) => {
let contents = contents.borrow();
serializer.write_processing_instruction(&contents.0, &contents.1)
}
}
}
}
impl ToString for NodeRef {
#[inline]
fn to_string(&self) -> String {
let mut u8_vec = Vec::new();
self.serialize(&mut u8_vec).unwrap();
String::from_utf8(u8_vec).unwrap()
}
}
impl NodeRef {
/// Serialize this node and its descendants in HTML syntax to the given stream.
#[inline]
pub fn serialize<W: Write>(&self, writer: &mut W) -> Result<()> {
serialize(
writer,
self,
SerializeOpts {
traversal_scope: IncludeNode,
..Default::default()
},
)
}
/// Serialize this node and its descendants in HTML syntax to a new file at the given path.
#[inline]
pub fn serialize_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
let mut file = File::create(&path)?;
self.serialize(&mut file)
}
}
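// A minimal usage sketch, assuming kuchiki's `parse_html` front-end and the
// `traits` prelude used elsewhere in this commit: `to_string` serializes a
// node into memory, and `serialize` writes the same HTML to any `Write` impl.
#[cfg(test)]
mod serialize_usage_sketch {
    use crate::parser::parse_html;
    use crate::traits::*;

    #[test]
    fn to_string_and_serialize_agree() {
        let document = parse_html().one("<!DOCTYPE html><title>t</title><p>Hi");
        let html = document.to_string();
        assert!(html.contains("<p>Hi</p>"));

        // Serializing into an in-memory buffer yields the same bytes.
        let mut buf = Vec::new();
        document.serialize(&mut buf).unwrap();
        assert_eq!(String::from_utf8(buf).unwrap(), html);
    }
}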

View File

@ -1,185 +0,0 @@
use html5ever::tree_builder::QuirksMode;
use html5ever::QualName;
use std::path::Path;
use tempfile::TempDir;
use crate::parser::{parse_html, parse_fragment};
use crate::select::*;
use crate::traits::*;
#[test]
fn text_nodes() {
let html = r"
<!doctype html>
<title>Test case</title>
<p>Content contains <b>Important</b> data</p>";
let document = parse_html().one(html);
let paragraph = document.select("p").unwrap().collect::<Vec<_>>();
assert_eq!(paragraph.len(), 1);
assert_eq!(
paragraph[0].text_contents(),
"Content contains Important data"
);
let texts = paragraph[0]
.as_node()
.descendants()
.text_nodes()
.collect::<Vec<_>>();
assert_eq!(texts.len(), 3);
assert_eq!(&*texts[0].borrow(), "Content contains ");
assert_eq!(&*texts[1].borrow(), "Important");
assert_eq!(&*texts[2].borrow(), " data");
{
let mut x = texts[0].borrow_mut();
x.truncate(0);
x.push_str("Content doesn't contain ");
}
assert_eq!(&*texts[0].borrow(), "Content doesn't contain ");
}
#[test]
fn parse_and_serialize() {
let html = r"
<!doctype html>
<title>Test case</title>
<p>Content";
let document = parse_html().one(html);
assert_eq!(
document.as_document().unwrap().quirks_mode(),
QuirksMode::NoQuirks
);
assert_eq!(
document.to_string(),
r"<!DOCTYPE html><html><head><title>Test case</title>
</head><body><p>Content</p></body></html>"
);
}
#[test]
fn parse_and_serialize_fragment() {
let html = r"<tbody><tr><td>Test case";
let ctx_name = QualName::new(None, ns!(html), local_name!("tbody"));
let document = parse_fragment(ctx_name, vec![]).one(html);
assert_eq!(document.as_document().unwrap().quirks_mode(), QuirksMode::NoQuirks);
assert_eq!(document.to_string(), r"<html><tr><td>Test case</td></tr></html>");
}
#[test]
fn parse_file() {
let mut path = Path::new(env!("CARGO_MANIFEST_DIR")).to_path_buf();
path.push("test_data".to_string());
path.push("foo.html");
let html = r"<!DOCTYPE html><html><head>
<title>Test case</title>
</head>
<body>
<p>Foo</p>
</body></html>";
let document = parse_html().from_utf8().from_file(&path).unwrap();
assert_eq!(document.to_string(), html);
}
#[test]
fn serialize_and_read_file() {
let tempdir = TempDir::new().unwrap();
let mut path = tempdir.path().to_path_buf();
path.push("temp.html");
let html = r"<!DOCTYPE html><html><head><title>Title</title></head><body>Body</body></html>";
let document = parse_html().one(html);
let _ = document.serialize_to_file(path.clone());
let document2 = parse_html().from_utf8().from_file(&path).unwrap();
assert_eq!(document.to_string(), document2.to_string());
}
#[test]
fn select() {
let html = r"
<title>Test case</title>
<p class=foo>Foo
<p>Bar
<p class=foo>Foo
";
let document = parse_html().one(html);
let matching = document.select("p.foo").unwrap().collect::<Vec<_>>();
assert_eq!(matching.len(), 2);
let child = matching[0].as_node().first_child().unwrap();
assert_eq!(&**child.as_text().unwrap().borrow(), "Foo\n");
assert_eq!(matching[0].attributes.borrow().get("class"), Some("foo"));
assert_eq!(
matching[0].attributes.borrow().get(local_name!("class")),
Some("foo")
);
let selectors = Selectors::compile("p.foo").unwrap();
let matching2 = selectors
.filter(document.descendants().elements())
.collect::<Vec<_>>();
assert_eq!(matching, matching2);
}
#[test]
fn select_first() {
let html = r"
<title>Test case</title>
<p class=foo>Foo
<p>Bar
<p class=foo>Baz
";
let document = parse_html().one(html);
let matching = document.select_first("p.foo").unwrap();
let child = matching.as_node().first_child().unwrap();
assert_eq!(&**child.as_text().unwrap().borrow(), "Foo\n");
assert_eq!(matching.attributes.borrow().get("class"), Some("foo"));
assert_eq!(
matching.attributes.borrow().get(local_name!("class")),
Some("foo")
);
assert!(document.select_first("p.bar").is_err());
}
#[test]
fn to_string() {
let html = r"<!DOCTYPE html>
<html>
<head>
<title>Test case</title>
</head>
<body>
<p class=foo>Foo
</body>
</html>";
let document = parse_html().one(html);
assert_eq!(
document
.inclusive_descendants()
.nth(11)
.unwrap()
.to_string(),
"<p class=\"foo\">Foo\n \n</p>"
);
}
#[test]
fn specificity() {
let selectors = Selectors::compile(".example, :first-child, div").unwrap();
let specificities = selectors
.0
.iter()
.map(|s| s.specificity())
.collect::<Vec<_>>();
assert_eq!(specificities.len(), 3);
assert!(specificities[0] == specificities[1]);
assert!(specificities[0] > specificities[2]);
assert!(specificities[1] > specificities[2]);
}

View File

@ -1,489 +0,0 @@
use html5ever::tree_builder::QuirksMode;
use html5ever::QualName;
use std::cell::{Cell, RefCell};
use std::fmt;
use std::ops::Deref;
use std::rc::{Rc, Weak};
use crate::attributes::{Attribute, Attributes, ExpandedName};
use crate::cell_extras::*;
use crate::iter::NodeIterator;
/// Node data specific to the node type.
#[derive(Debug, PartialEq, Clone)]
pub enum NodeData {
/// Element node
Element(ElementData),
/// Text node
Text(RefCell<String>),
/// Comment node
Comment(RefCell<String>),
/// Processing instruction node
ProcessingInstruction(RefCell<(String, String)>),
/// Doctype node
Doctype(Doctype),
/// Document node
Document(DocumentData),
/// Document fragment node
DocumentFragment,
}
/// Data specific to doctype nodes.
#[derive(Debug, PartialEq, Clone)]
pub struct Doctype {
/// The name of the doctype
pub name: String,
/// The public ID of the doctype
pub public_id: String,
/// The system ID of the doctype
pub system_id: String,
}
/// Data specific to element nodes.
#[derive(Debug, PartialEq, Clone)]
pub struct ElementData {
/// The namespace and local name of the element, such as `ns!(html)` and `body`.
pub name: QualName,
/// The attributes of the elements.
pub attributes: RefCell<Attributes>,
/// If the element is an HTML `<template>` element,
/// the document fragment node that is the root of template contents.
pub template_contents: Option<NodeRef>,
}
/// Data specific to document nodes.
#[derive(Debug, PartialEq, Clone)]
pub struct DocumentData {
#[doc(hidden)]
pub _quirks_mode: Cell<QuirksMode>,
}
impl DocumentData {
/// The quirks mode of the document, as determined by the HTML parser.
#[inline]
pub fn quirks_mode(&self) -> QuirksMode {
self._quirks_mode.get()
}
}
/// A strong reference to a node.
///
/// A node is destroyed when the last strong reference to it is dropped.
///
/// Each node holds a strong reference to its first child and next sibling (if any),
/// but only a weak reference to its last child, previous sibling, and parent.
/// This is to avoid strong reference cycles, which would cause memory leaks.
///
/// As a result, a single `NodeRef` is sufficient to keep alive a node
/// and nodes that are after it in tree order
/// (its descendants, its following siblings, and their descendants)
/// but not other nodes in a tree.
///
/// To avoid destroying nodes prematurely,
/// programs typically hold a strong reference to the root of a document
/// until they're done with that document.
#[derive(Clone, Debug)]
pub struct NodeRef(pub Rc<Node>);
impl Deref for NodeRef {
type Target = Node;
#[inline]
fn deref(&self) -> &Node {
&*self.0
}
}
impl Eq for NodeRef {}
impl PartialEq for NodeRef {
#[inline]
fn eq(&self, other: &NodeRef) -> bool {
let a: *const Node = &*self.0;
let b: *const Node = &*other.0;
a == b
}
}
/// A node inside a DOM-like tree.
pub struct Node {
parent: Cell<Option<Weak<Node>>>,
previous_sibling: Cell<Option<Weak<Node>>>,
next_sibling: Cell<Option<Rc<Node>>>,
first_child: Cell<Option<Rc<Node>>>,
last_child: Cell<Option<Weak<Node>>>,
data: NodeData,
}
impl fmt::Debug for Node {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "{:?} @ {:?}", self.data, self as *const Node)
}
}
/// Prevent implicit recursion when dropping nodes to avoid overflowing the stack.
///
/// The implicit drop is correct, but recursive.
/// In the worst case (where no node has both a next sibling and a child),
/// a tree of a few tens of thousands of nodes could cause a stack overflow.
///
/// This `Drop` implementation makes sure the recursion does not happen.
/// Instead, it has an explicit `Vec<Rc<Node>>` stack to traverse the subtree,
/// but only following `Rc<Node>` references that are "unique":
/// that have a strong reference count of 1.
/// Those are the nodes that would have been dropped recursively.
///
/// The stack holds ancestors of the current node rather than preceding siblings,
/// on the assumption that large document trees are typically wider than deep.
impl Drop for Node {
fn drop(&mut self) {
// `.take_if_unique_strong()` temporarily leaves the tree in an inconsistent state,
// as the corresponding `Weak` reference in the other direction is not removed.
// It is important that all `Some(_)` strong references it returns
// are dropped by the end of this `drop` call,
// and that no user code is invoked in-between.
// Sharing `stack` between these two calls is not necessary,
// but it allows re-using memory allocations.
let mut stack = Vec::new();
if let Some(rc) = self.first_child.take_if_unique_strong() {
non_recursive_drop_unique_rc(rc, &mut stack);
}
if let Some(rc) = self.next_sibling.take_if_unique_strong() {
non_recursive_drop_unique_rc(rc, &mut stack);
}
fn non_recursive_drop_unique_rc(mut rc: Rc<Node>, stack: &mut Vec<Rc<Node>>) {
loop {
if let Some(child) = rc.first_child.take_if_unique_strong() {
stack.push(rc);
rc = child;
continue;
}
if let Some(sibling) = rc.next_sibling.take_if_unique_strong() {
// The previous value of `rc: Rc<Node>` is dropped here.
// Since it was unique, the corresponding `Node` is dropped as well.
// `<Node as Drop>::drop` does not call `drop_rc`
// as both the first child and next sibling were already taken.
// Weak reference counts decremented here for `Cell`s that are `Some`:
// * `rc.parent`: still has a strong reference in `stack` or elsewhere
// * `rc.last_child`: this is the last weak ref. Deallocated now.
// * `rc.previous_sibling`: this is the last weak ref. Deallocated now.
rc = sibling;
continue;
}
if let Some(parent) = stack.pop() {
// Same as in the above comment.
rc = parent;
continue;
}
return;
}
}
}
}
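// A small sketch of the situation the non-recursive `Drop` above guards
// against, assuming the `NodeRef` constructors defined later in this file:
// a long chain of only-children would otherwise be freed by recursion.
#[cfg(test)]
mod drop_depth_sketch {
    use super::NodeRef;

    #[test]
    fn dropping_a_deep_chain_does_not_overflow() {
        let root = NodeRef::new_text("");
        let mut tail = root.clone();
        for _ in 0..100_000 {
            let child = NodeRef::new_text("");
            tail.append(child.clone());
            tail = child;
        }
        // Dropping the chain walks it iteratively via the explicit stack
        // in `<Node as Drop>::drop` instead of recursing once per node.
        drop(tail);
        drop(root);
    }
}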
impl NodeRef {
/// Create a new node.
#[inline]
pub fn new(data: NodeData) -> NodeRef {
NodeRef(Rc::new(Node {
parent: Cell::new(None),
first_child: Cell::new(None),
last_child: Cell::new(None),
previous_sibling: Cell::new(None),
next_sibling: Cell::new(None),
data,
}))
}
/// Create a new element node.
#[inline]
pub fn new_element<I>(name: QualName, attributes: I) -> NodeRef
where
I: IntoIterator<Item = (ExpandedName, Attribute)>,
{
NodeRef::new(NodeData::Element(ElementData {
template_contents: if name.expanded() == expanded_name!(html "template") {
Some(NodeRef::new(NodeData::DocumentFragment))
} else {
None
},
name,
attributes: RefCell::new(Attributes {
map: attributes.into_iter().collect(),
}),
}))
}
/// Create a new text node.
#[inline]
pub fn new_text<T: Into<String>>(value: T) -> NodeRef {
NodeRef::new(NodeData::Text(RefCell::new(value.into())))
}
/// Create a new comment node.
#[inline]
pub fn new_comment<T: Into<String>>(value: T) -> NodeRef {
NodeRef::new(NodeData::Comment(RefCell::new(value.into())))
}
/// Create a new processing instruction node.
#[inline]
pub fn new_processing_instruction<T1, T2>(target: T1, data: T2) -> NodeRef
where
T1: Into<String>,
T2: Into<String>,
{
NodeRef::new(NodeData::ProcessingInstruction(RefCell::new((
target.into(),
data.into(),
))))
}
/// Create a new doctype node.
#[inline]
pub fn new_doctype<T1, T2, T3>(name: T1, public_id: T2, system_id: T3) -> NodeRef
where
T1: Into<String>,
T2: Into<String>,
T3: Into<String>,
{
NodeRef::new(NodeData::Doctype(Doctype {
name: name.into(),
public_id: public_id.into(),
system_id: system_id.into(),
}))
}
/// Create a new document node.
#[inline]
pub fn new_document() -> NodeRef {
NodeRef::new(NodeData::Document(DocumentData {
_quirks_mode: Cell::new(QuirksMode::NoQuirks),
}))
}
/// Return the concatenation of all text nodes in this subtree.
pub fn text_contents(&self) -> String {
let mut s = String::new();
for text_node in self.inclusive_descendants().text_nodes() {
s.push_str(&text_node.borrow());
}
s
}
}
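// A minimal construction sketch, assuming the html5ever macros (`ns!`,
// `local_name!`) that this file already relies on for `expanded_name!`:
// build a <p class="foo"> element with a text child and read its text back.
#[cfg(test)]
mod construct_sketch {
    use super::NodeRef;
    use crate::attributes::{Attribute, ExpandedName};
    use html5ever::QualName;

    #[test]
    fn build_element_with_text() {
        let p = NodeRef::new_element(
            QualName::new(None, ns!(html), local_name!("p")),
            vec![(
                ExpandedName::new(ns!(), local_name!("class")),
                Attribute {
                    prefix: None,
                    value: "foo".into(),
                },
            )],
        );
        p.append(NodeRef::new_text("Hello"));
        assert_eq!(p.text_contents(), "Hello");
    }
}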
impl Node {
    /// Return a reference to this node's node-type-specific data.
#[inline]
pub fn data(&self) -> &NodeData {
&self.data
}
/// If this node is an element, return a reference to element-specific data.
#[inline]
pub fn as_element(&self) -> Option<&ElementData> {
match self.data {
NodeData::Element(ref value) => Some(value),
_ => None,
}
}
/// If this node is a text node, return a reference to its contents.
#[inline]
pub fn as_text(&self) -> Option<&RefCell<String>> {
match self.data {
NodeData::Text(ref value) => Some(value),
_ => None,
}
}
/// If this node is a comment, return a reference to its contents.
#[inline]
pub fn as_comment(&self) -> Option<&RefCell<String>> {
match self.data {
NodeData::Comment(ref value) => Some(value),
_ => None,
}
}
/// If this node is a document, return a reference to doctype-specific data.
#[inline]
pub fn as_doctype(&self) -> Option<&Doctype> {
match self.data {
NodeData::Doctype(ref value) => Some(value),
_ => None,
}
}
/// If this node is a document, return a reference to document-specific data.
#[inline]
pub fn as_document(&self) -> Option<&DocumentData> {
match self.data {
NodeData::Document(ref value) => Some(value),
_ => None,
}
}
/// Return a reference to the parent node, unless this node is the root of the tree.
#[inline]
pub fn parent(&self) -> Option<NodeRef> {
self.parent.upgrade().map(NodeRef)
}
/// Return a reference to the first child of this node, unless it has no child.
#[inline]
pub fn first_child(&self) -> Option<NodeRef> {
self.first_child.clone_inner().map(NodeRef)
}
/// Return a reference to the last child of this node, unless it has no child.
#[inline]
pub fn last_child(&self) -> Option<NodeRef> {
self.last_child.upgrade().map(NodeRef)
}
/// Return a reference to the previous sibling of this node, unless it is a first child.
#[inline]
pub fn previous_sibling(&self) -> Option<NodeRef> {
self.previous_sibling.upgrade().map(NodeRef)
}
/// Return a reference to the next sibling of this node, unless it is a last child.
#[inline]
pub fn next_sibling(&self) -> Option<NodeRef> {
self.next_sibling.clone_inner().map(NodeRef)
}
/// Detach a node from its parent and siblings. Children are not affected.
///
/// To remove a node and its descendants, detach it and drop any strong reference to it.
pub fn detach(&self) {
let parent_weak = self.parent.take();
let previous_sibling_weak = self.previous_sibling.take();
let next_sibling_strong = self.next_sibling.take();
let previous_sibling_opt = previous_sibling_weak
.as_ref()
.and_then(|weak| weak.upgrade());
if let Some(next_sibling_ref) = next_sibling_strong.as_ref() {
next_sibling_ref
.previous_sibling
.replace(previous_sibling_weak);
} else if let Some(parent_ref) = parent_weak.as_ref() {
if let Some(parent_strong) = parent_ref.upgrade() {
parent_strong.last_child.replace(previous_sibling_weak);
}
}
if let Some(previous_sibling_strong) = previous_sibling_opt {
previous_sibling_strong
.next_sibling
.replace(next_sibling_strong);
} else if let Some(parent_ref) = parent_weak.as_ref() {
if let Some(parent_strong) = parent_ref.upgrade() {
parent_strong.first_child.replace(next_sibling_strong);
}
}
}
}
impl NodeRef {
/// Append a new child to this node, after existing children.
///
/// The new child is detached from its previous position.
pub fn append(&self, new_child: NodeRef) {
new_child.detach();
new_child.parent.replace(Some(Rc::downgrade(&self.0)));
if let Some(last_child_weak) = self.last_child.replace(Some(Rc::downgrade(&new_child.0))) {
if let Some(last_child) = last_child_weak.upgrade() {
new_child.previous_sibling.replace(Some(last_child_weak));
debug_assert!(last_child.next_sibling.is_none());
last_child.next_sibling.replace(Some(new_child.0));
return;
}
}
debug_assert!(self.first_child.is_none());
self.first_child.replace(Some(new_child.0));
}
/// Prepend a new child to this node, before existing children.
///
/// The new child is detached from its previous position.
pub fn prepend(&self, new_child: NodeRef) {
new_child.detach();
new_child.parent.replace(Some(Rc::downgrade(&self.0)));
if let Some(first_child) = self.first_child.take() {
debug_assert!(first_child.previous_sibling.is_none());
first_child
.previous_sibling
.replace(Some(Rc::downgrade(&new_child.0)));
new_child.next_sibling.replace(Some(first_child));
} else {
debug_assert!(self.first_child.is_none());
self.last_child.replace(Some(Rc::downgrade(&new_child.0)));
}
self.first_child.replace(Some(new_child.0));
}
/// Insert a new sibling after this node.
///
/// The new sibling is detached from its previous position.
pub fn insert_after(&self, new_sibling: NodeRef) {
new_sibling.detach();
new_sibling.parent.replace(self.parent.clone_inner());
new_sibling
.previous_sibling
.replace(Some(Rc::downgrade(&self.0)));
if let Some(next_sibling) = self.next_sibling.take() {
debug_assert!(next_sibling.previous_sibling().unwrap() == *self);
next_sibling
.previous_sibling
.replace(Some(Rc::downgrade(&new_sibling.0)));
new_sibling.next_sibling.replace(Some(next_sibling));
} else if let Some(parent) = self.parent() {
debug_assert!(parent.last_child().unwrap() == *self);
parent
.last_child
.replace(Some(Rc::downgrade(&new_sibling.0)));
}
self.next_sibling.replace(Some(new_sibling.0));
}
/// Insert a new sibling before this node.
///
/// The new sibling is detached from its previous position.
pub fn insert_before(&self, new_sibling: NodeRef) {
new_sibling.detach();
new_sibling.parent.replace(self.parent.clone_inner());
new_sibling.next_sibling.replace(Some(self.0.clone()));
if let Some(previous_sibling_weak) = self
.previous_sibling
.replace(Some(Rc::downgrade(&new_sibling.0)))
{
if let Some(previous_sibling) = previous_sibling_weak.upgrade() {
new_sibling
.previous_sibling
.replace(Some(previous_sibling_weak));
debug_assert!(previous_sibling.next_sibling().unwrap() == *self);
previous_sibling.next_sibling.replace(Some(new_sibling.0));
return;
}
}
if let Some(parent) = self.parent() {
debug_assert!(parent.first_child().unwrap() == *self);
parent.first_child.replace(Some(new_sibling.0));
}
}
}
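// A minimal manipulation sketch, assuming the constructors defined earlier in
// this file: `insert_before` splices into the sibling list, and `detach`
// unlinks a node from its parent and siblings without touching its children.
#[cfg(test)]
mod manipulation_sketch {
    use super::NodeRef;

    #[test]
    fn insert_and_detach() {
        let parent = NodeRef::new_document();
        let a = NodeRef::new_text("a");
        let c = NodeRef::new_text("c");
        parent.append(a.clone());
        parent.append(c.clone());

        // Splice a new node between the two existing children.
        let b = NodeRef::new_text("b");
        c.insert_before(b.clone());
        assert_eq!(parent.text_contents(), "abc");

        // Detaching only removes the node from its parent and siblings.
        b.detach();
        assert_eq!(parent.text_contents(), "ac");
    }
}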

View File

@ -1,9 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<title>Test case</title>
</head>
<body>
<p>Foo</p>
</body>
</html>

View File

@ -1,31 +0,0 @@
name: Build
on: [push, pull_request]
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
- macOS-latest
- windows-latest
rust:
- stable
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
toolchain: ${{ matrix.rust }}
override: true
- name: Build
run: |
cargo build --all-targets --no-default-features --verbose
cargo build --all-targets --verbose
- name: Run tests
run: cargo test --all-targets --verbose
env:
RUST_BACKTRACE: 1

View File

@ -1,27 +0,0 @@
name: Coverage
on:
pull_request:
push:
branches:
- master
jobs:
coverage:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
- uses: actions-rs/install@v0.1
with:
crate: cargo-tarpaulin
use-tool-cache: true
- name: Run coverage
run: cargo tarpaulin -f -t 5 --out Xml -v -- --test-threads=1
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
token: ${{secrets.CODECOV_TOKEN}}

View File

@ -1,24 +0,0 @@
name: Style check
on: [push, pull_request]
jobs:
clippy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Install clippy
uses: actions-rs/toolchain@v1
with:
toolchain: stable
components: clippy
- uses: actions-rs/clippy-check@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}
args: --all --all-features
fmt:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- name: Run fmt check
run: cargo fmt --all -- --check

View File

@ -1,4 +0,0 @@
/target/
**/*.rs.bk
Cargo.lock
/.vscode

View File

@ -1,16 +0,0 @@
[package]
name = "sanitize_html"
version = "0.7.0"
authors = ["Andrey Kutejko <andy128k@gmail.com>"]
description = "Rule-based HTML Sanitization library"
keywords = ["html", "sanitize"]
license = "MIT"
homepage = "https://github.com/andy128k/sanitize-html-rs"
repository = "https://github.com/andy128k/sanitize-html-rs.git"
edition = "2018"
[dependencies]
regex = "^1.5.6"
lazy_static = "^1.4.0"
html5ever = "^0.26"
kuchiki = { path = "../kuchiki" }

View File

@ -1,18 +0,0 @@
Copyright (c) 2017 Andrey Kutejko
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -1,8 +0,0 @@
# Sanitize HTML
[![Crates.io Status](https://img.shields.io/crates/v/sanitize_html.svg)](https://crates.io/crates/sanitize_html)
[![Build](https://github.com/andy128k/sanitize-html-rs/workflows/Build/badge.svg?branch=master&event=push)](https://github.com/andy128k/sanitize-html-rs/actions?query=workflow%3ABuild)
[![codecov](https://codecov.io/gh/andy128k/sanitize-html-rs/branch/master/graph/badge.svg)](https://codecov.io/gh/andy128k/sanitize-html-rs)
[![dependency status](https://deps.rs/repo/github/andy128k/sanitize-html-rs/status.svg)](https://deps.rs/repo/github/andy128k/sanitize-html-rs)
This is a library for sanitization of HTML fragments.

View File

@ -1,37 +0,0 @@
//! Error types, which can be emitted by the sanitization procedure.
use std::error::Error;
use std::fmt;
/// Sanitization error
#[derive(Debug)]
pub enum SanitizeError {
/// UTF-8 decoding error
StrUtf8Error(std::str::Utf8Error),
/// UTF-8 decoding error
Utf8Error(std::string::FromUtf8Error),
/// Serialization error
SerializeError(std::io::Error),
}
impl fmt::Display for SanitizeError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SanitizeError::StrUtf8Error(e) => write!(f, "UTF-8 decode error {}", e),
SanitizeError::Utf8Error(e) => write!(f, "UTF-8 decode error {}", e),
SanitizeError::SerializeError(e) => write!(f, "Serialization error {}", e),
}
}
}
impl Error for SanitizeError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
match self {
SanitizeError::StrUtf8Error(e) => Some(e),
SanitizeError::Utf8Error(e) => Some(e),
SanitizeError::SerializeError(e) => Some(e),
}
}
}

View File

@ -1,42 +0,0 @@
//! HTML Sanitization library
//!
//! # Examples
//!
//! ```
//! use sanitize_html::sanitize_str;
//! use sanitize_html::rules::predefined::DEFAULT;
//!
//! let input = "<b>Lo<!-- comment -->rem</b> <a href=\"pants\" title=\"foo\">ipsum</a> <a href=\"http://foo.com/\"><strong>dolor</strong></a> sit<br/>amet <script>alert(\"hello world\");</script>";
//!
//! let sanitized_default: String = sanitize_str(&DEFAULT, input).unwrap();
//! assert_eq!(&sanitized_default, "Lorem ipsum dolor sit amet ");
//! ```
#![deny(missing_docs)]
pub mod errors;
mod parse;
pub mod rules;
mod sanitize;
mod tests;
use crate::errors::SanitizeError;
use crate::rules::Rules;
/// Sanitize HTML bytes
pub fn sanitize_bytes(rules: &Rules, input: &[u8]) -> Result<Vec<u8>, SanitizeError> {
let input_str = std::str::from_utf8(input).map_err(SanitizeError::StrUtf8Error)?;
let dom = parse::parse_str(input_str);
let new_dom = sanitize::sanitize_dom(&dom, rules);
let result_bytes = parse::unparse_bytes(&new_dom)?;
Ok(result_bytes)
}
/// Sanitize HTML string
pub fn sanitize_str(rules: &Rules, input: &str) -> Result<String, SanitizeError> {
let dom = parse::parse_str(input);
let new_dom = sanitize::sanitize_dom(&dom, rules);
let result_bytes = parse::unparse_bytes(&new_dom)?;
let result_string = String::from_utf8(result_bytes).map_err(SanitizeError::Utf8Error)?;
Ok(result_string)
}
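// A minimal usage sketch for the byte-level entry point, assuming the
// predefined DEFAULT rules shown in the crate-level example: <p> is a "space"
// element and <script> is deleted outright.
#[cfg(test)]
mod bytes_usage_sketch {
    use crate::rules::predefined::DEFAULT;
    use crate::sanitize_bytes;

    #[test]
    fn strips_tags_from_bytes() {
        let input = b"<p>Hello <script>alert(1)</script>world</p>";
        let out = sanitize_bytes(&DEFAULT, input).unwrap();
        assert_eq!(String::from_utf8(out).unwrap(), " Hello world ");
    }
}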

View File

@ -1,38 +0,0 @@
use super::errors::SanitizeError;
use html5ever::{
interface::QualName,
local_name, namespace_prefix, namespace_url, ns, serialize,
serialize::{SerializeOpts, TraversalScope},
tendril::TendrilSink,
};
use kuchiki::{parse_html_with_options, NodeRef, ParseOpts};
use std::default::Default;
pub(crate) fn parse_str(input: &str) -> NodeRef {
let mut opts = ParseOpts::default();
opts.tree_builder.drop_doctype = true;
let mut parser = parse_html_with_options(opts);
parser.process(input.into());
parser.finish()
}
pub(crate) fn unparse_bytes(dom: &NodeRef) -> Result<Vec<u8>, SanitizeError> {
let mut buf: Vec<u8> = Vec::new();
let parent = QualName::new(
Some(namespace_prefix!("html")),
ns!(html),
local_name!("div"),
);
let opts = SerializeOpts {
scripting_enabled: false,
traversal_scope: TraversalScope::ChildrenOnly(Some(parent)),
create_missing_parent: false,
};
serialize(&mut buf, dom, opts).map_err(SanitizeError::SerializeError)?;
Ok(buf)
}
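// A round-trip sketch, assuming only the two helpers above: parse a fragment
// into a kuchiki DOM, then serialize the document's children back to bytes.
#[cfg(test)]
mod parse_roundtrip_sketch {
    use super::{parse_str, unparse_bytes};

    #[test]
    fn roundtrip_keeps_markup() {
        let dom = parse_str("<p>Hello</p>");
        let bytes = unparse_bytes(&dom).unwrap();
        let html = String::from_utf8(bytes).unwrap();
        // The parser wraps the fragment in html/head/body elements.
        assert!(html.contains("<p>Hello</p>"));
    }
}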

View File

@ -1,141 +0,0 @@
//! Structures to define sanitization rules.
pub mod pattern;
pub mod predefined;
use self::pattern::Pattern;
use std::collections::HashMap;
use std::collections::HashSet;
/// Structure to describe an HTML element
pub struct Element {
/// name of an element
pub name: String,
/// Whitelist of allowed attributes
pub attributes: HashMap<String, Pattern>,
    /// List of mandatory attributes and their values.
    /// These attributes will be forcibly added to the element.
pub mandatory_attributes: HashMap<String, String>,
/// Attribute rules
pub attribute_rules: AttributeRules,
}
impl Element {
    /// Creates an element descriptor
pub fn new(name: &str) -> Self {
Self {
name: name.to_owned(),
attributes: HashMap::new(),
mandatory_attributes: HashMap::new(),
attribute_rules: AttributeRules::new(),
}
}
/// Adds an attribute
pub fn attribute(mut self, attribute: &str, pattern: Pattern) -> Self {
self.attributes.insert(attribute.to_owned(), pattern);
self
}
    /// Adds a mandatory attribute
pub fn mandatory_attribute(mut self, attribute: &str, value: &str) -> Self {
self.mandatory_attributes
.insert(attribute.to_owned(), value.to_owned());
self
}
    /// Checks whether an attribute value is valid
pub fn is_valid(&self, attribute: &str, value: &str) -> bool {
match self.attributes.get(attribute) {
None => false,
Some(pattern) => pattern.matches(value),
}
}
}
/// Structure to describe sanitization rules
#[derive(Default)]
pub struct Rules {
    /// Determines if comments are kept or stripped out of a document.
pub allow_comments: bool,
/// Allowed elements.
pub allowed_elements: HashMap<String, Element>,
/// Elements which will be removed together with their children.
pub delete_elements: HashSet<String>,
/// Elements which will be replaced by spaces (Their children will be processed recursively).
pub space_elements: HashSet<String>,
/// Elements which will be renamed.
pub rename_elements: HashMap<String, String>,
}
impl Rules {
/// Creates a new rules set.
pub fn new() -> Self {
Self::default()
}
/// Sets if comments are allowed
pub fn allow_comments(mut self, allow_comments: bool) -> Self {
self.allow_comments = allow_comments;
self
}
/// Adds a rule for an allowed element
pub fn element(mut self, element: Element) -> Self {
self.allowed_elements.insert(element.name.clone(), element);
self
}
/// Adds a rule to delete an element
pub fn delete(mut self, element_name: &str) -> Self {
self.delete_elements.insert(element_name.to_owned());
self
}
/// Adds a rule to replace an element with space
pub fn space(mut self, element_name: &str) -> Self {
self.space_elements.insert(element_name.to_owned());
self
}
/// Adds a rule to rename an element
pub fn rename(mut self, element_name: &str, to: &str) -> Self {
self.rename_elements
.insert(element_name.to_owned(), to.to_owned());
self
}
}
/// Structure to define rules for attributes
#[derive(Default)]
pub struct AttributeRules {
    /// Attributes which will be renamed.
pub rename_attributes: HashMap<String, String>,
/// Functions to modify attribute contents
pub modify_attributes: HashMap<String, Box<dyn Fn(String) -> String + Sync>>,
}
impl AttributeRules {
/// Create a new attribute rules set.
pub fn new() -> Self {
Self::default()
}
/// Adds a rule to rename an attribute
pub fn rename(&mut self, attribute_name: &str, to: &str) -> &Self {
self.rename_attributes
.insert(attribute_name.to_owned(), to.to_owned());
self
}
/// Adds a rule with a function to modify the contents of an attribute
pub fn modify(
&mut self,
attribute_name: &str,
function: Box<dyn Fn(String) -> String + Sync>,
) -> &Self {
self.modify_attributes
.insert(attribute_name.to_owned(), function);
self
}
}
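// A minimal builder sketch, assuming only the types above: allow <a href>
// with any value, delete <script> entirely, and replace <div> with spaces.
#[cfg(test)]
mod rules_builder_sketch {
    use super::pattern::Pattern;
    use super::{Element, Rules};

    #[test]
    fn build_custom_rules() {
        let rules = Rules::new()
            .element(Element::new("a").attribute("href", Pattern::any()))
            .delete("script")
            .space("div");

        assert!(rules.allowed_elements.contains_key("a"));
        assert!(rules.allowed_elements["a"].is_valid("href", "https://example.org/"));
        assert!(!rules.allowed_elements["a"].is_valid("onclick", "alert(1)"));
        assert!(rules.delete_elements.contains("script"));
        assert!(rules.space_elements.contains("div"));
    }
}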

View File

@ -1,127 +0,0 @@
//! This module contains code dedicated to checking the validity of attribute values.
//!
//! # Examples
//! ```
//! use sanitize_html::rules::pattern::Pattern;
//! use regex::Regex;
//!
//! let href = Pattern::regex(Regex::new("^(ftp:|http:|https:|mailto:)").unwrap()) |
//! !Pattern::regex(Regex::new("^[^/]+[[:space:]]*:").unwrap());
//!
//! assert!(href.matches("filename.xls"));
//! assert!(href.matches("http://foo.com/"));
//! assert!(href.matches(" filename with spaces .zip "));
//! assert!(!href.matches(" javascript : window.location = '//example.com/'")); // Attempt to make XSS
//! ```
use regex::Regex;
/// Value pattern
pub struct Pattern(pub Box<dyn Fn(&str) -> bool + Sync + Send>);
impl Pattern {
    /// Creates a pattern which accepts any value.
///
/// # Example
/// ```
/// use sanitize_html::rules::pattern::Pattern;
/// use regex::Regex;
///
/// let pattern = Pattern::any();
/// assert!(pattern.matches(""));
/// assert!(pattern.matches("pants"));
/// ```
pub fn any() -> Self {
Pattern(Box::new(move |_value| true))
}
    /// Creates a pattern which uses a regular expression to check a value.
///
/// # Example
/// ```
/// use sanitize_html::rules::pattern::Pattern;
/// use regex::Regex;
///
/// let pattern = Pattern::regex(Regex::new("ant").unwrap());
/// assert!(!pattern.matches(""));
/// assert!(pattern.matches("pants"));
/// ```
pub fn regex(re: Regex) -> Self {
Pattern(Box::new(move |value| re.is_match(value)))
}
    /// Checks whether a value matches the pattern.
pub fn matches(&self, value: &str) -> bool {
(self.0)(value)
}
}
impl ::std::ops::Not for Pattern {
type Output = Pattern;
    /// Negates the pattern
///
/// # Example
/// ```
/// use sanitize_html::rules::pattern::Pattern;
/// use regex::Regex;
///
/// let pattern = !Pattern::any();
/// assert!(!pattern.matches(""));
/// assert!(!pattern.matches("pants"));
/// ```
fn not(self) -> Self::Output {
let cb = self.0;
Pattern(Box::new(move |value| !cb(value)))
}
}
impl ::std::ops::BitAnd for Pattern {
type Output = Pattern;
/// Combines two patterns into a pattern which matches a string iff both patterns match that string.
///
/// # Example
/// ```
/// use sanitize_html::rules::pattern::Pattern;
/// use regex::Regex;
///
/// let pan = Pattern::regex(Regex::new("pan").unwrap());
/// let ant = Pattern::regex(Regex::new("ant").unwrap());
/// let pattern = pan & ant;
///
/// assert!(!pattern.matches("pan"));
/// assert!(!pattern.matches("ant"));
/// assert!(pattern.matches("pants"));
/// ```
fn bitand(self, rhs: Pattern) -> Self::Output {
let cb1 = self.0;
let cb2 = rhs.0;
Pattern(Box::new(move |value| cb1(value) && cb2(value)))
}
}
impl ::std::ops::BitOr for Pattern {
type Output = Pattern;
    /// Combines two patterns into a pattern which matches a string if at least one of the patterns matches that string.
///
/// # Example
/// ```
/// use sanitize_html::rules::pattern::Pattern;
/// use regex::Regex;
///
/// let pan = Pattern::regex(Regex::new("pan").unwrap());
/// let pot = Pattern::regex(Regex::new("pot").unwrap());
/// let pattern = pan | pot;
///
/// assert!(pattern.matches("pants"));
/// assert!(pattern.matches("pot"));
/// assert!(!pattern.matches("jar"));
/// ```
fn bitor(self, rhs: Pattern) -> Self::Output {
let cb1 = self.0;
let cb2 = rhs.0;
Pattern(Box::new(move |value| cb1(value) || cb2(value)))
}
}
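// A small sketch of a hand-rolled pattern, assuming only the public tuple
// field above: any `Fn(&str) -> bool + Sync + Send` closure can serve as a
// pattern and combines with the `!`, `&` and `|` operators defined above.
#[cfg(test)]
mod custom_pattern_sketch {
    use super::Pattern;

    #[test]
    fn closure_based_pattern() {
        let short = Pattern(Box::new(|value: &str| value.len() <= 5));
        let empty = Pattern(Box::new(|value: &str| value.is_empty()));
        let pattern = short & !empty;

        assert!(pattern.matches("pants"));
        assert!(!pattern.matches(""));
        assert!(!pattern.matches("pantsuit"));
    }
}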

View File

@ -1,380 +0,0 @@
//! Predefined rules
//!
//! These rules are inspired by a great Ruby gem [sanitize](https://github.com/rgrove/sanitize/).
use super::pattern::Pattern;
use super::{Element, Rules};
use lazy_static::lazy_static;
use regex::Regex;
fn re(regex: &str) -> Pattern {
Pattern::regex(Regex::new(regex).unwrap())
}
fn href() -> Pattern {
re("^(ftp:|http:|https:|mailto:)") | !re("^[^/]+[[:space:]]*:")
}
fn src() -> Pattern {
re("^(http:|https:)") | !re("^[^/]+[[:space:]]*:")
}
lazy_static! {
/// Basic rules. Allows a variety of markup including formatting elements, links, and lists.
pub static ref BASIC: Rules = basic();
/// Default rules. Removes all tags.
pub static ref DEFAULT: Rules = default();
/// Relaxed rules. Allows an even wider variety of markup, including images and tables
pub static ref RELAXED: Rules = relaxed();
/// Restricted rules. Allows only very simple inline markup. No links, images, or block elements.
pub static ref RESTRICTED: Rules = restricted();
    /// Rules for documents from untrusted sources. Removes all tags except text emphasis and links.
pub static ref UNTRUSTED: Rules = untrusted();
}
/// Basic rules. Allows a variety of markup including formatting elements, links, and lists.
pub fn basic() -> Rules {
Rules::new()
.element(Element::new("a").attribute("href", href()))
.element(Element::new("abbr").attribute("title", Pattern::any()))
.element(Element::new("b"))
.element(Element::new("blockquote").attribute("cite", src()))
.element(Element::new("br"))
.element(Element::new("br"))
.element(Element::new("cite"))
.element(Element::new("code"))
.element(Element::new("dd"))
.element(Element::new("dfn").attribute("title", Pattern::any()))
.element(Element::new("dl"))
.element(Element::new("dt"))
.element(Element::new("em"))
.element(Element::new("i"))
.element(Element::new("kbd"))
.element(Element::new("li"))
.element(Element::new("mark"))
.element(Element::new("ol"))
.element(Element::new("p"))
.element(Element::new("pre"))
.element(Element::new("q").attribute("cite", src()))
.element(Element::new("s"))
.element(Element::new("samp"))
.element(Element::new("small"))
.element(Element::new("strike"))
.element(Element::new("strong"))
.element(Element::new("sub"))
.element(Element::new("sup"))
.element(
Element::new("time")
.attribute("datetime", Pattern::any())
.attribute("pubdate", Pattern::any()),
)
.element(Element::new("u"))
.element(Element::new("ul"))
.element(Element::new("var"))
.space("address")
.space("article")
.space("aside")
.space("div")
.space("footer")
.space("h1")
.space("h2")
.space("h3")
.space("h4")
.space("h5")
.space("h6")
.space("header")
.space("hgroup")
.space("hr")
.space("nav")
.space("section")
.delete("element_name")
}
/// Default rules. Removes all tags.
pub fn default() -> Rules {
Rules::new()
.space("address")
.space("article")
.space("aside")
.space("blockquote")
.space("br")
.space("dd")
.space("div")
.space("dl")
.space("dt")
.space("footer")
.space("h1")
.space("h2")
.space("h3")
.space("h4")
.space("h5")
.space("h6")
.space("header")
.space("hgroup")
.space("hr")
.space("li")
.space("nav")
.space("ol")
.space("p")
.space("pre")
.space("section")
.space("ul")
.delete("iframe")
.delete("noembed")
.delete("noframes")
.delete("noscript")
.delete("script")
.delete("style")
}
/// Relaxed rules. Allows an even wider variety of markup, including images and tables
pub fn relaxed() -> Rules {
fn relaxed_element(name: &str) -> Element {
Element::new(name)
.attribute("dir", Pattern::any())
.attribute("lang", Pattern::any())
.attribute("title", Pattern::any())
.attribute("class", Pattern::any())
}
Rules::new()
.element(relaxed_element("a").attribute("href", href()))
.element(relaxed_element("abbr"))
.element(relaxed_element("b"))
.element(relaxed_element("bdo"))
.element(relaxed_element("blockquote").attribute("cite", src()))
.element(relaxed_element("br"))
.element(relaxed_element("caption"))
.element(relaxed_element("cite"))
.element(relaxed_element("code"))
.element(
relaxed_element("col")
.attribute("span", Pattern::any())
.attribute("width", Pattern::any()),
)
.element(
relaxed_element("colgroup")
.attribute("span", Pattern::any())
.attribute("width", Pattern::any()),
)
.element(relaxed_element("dd"))
.element(
relaxed_element("del")
.attribute("cite", src())
.attribute("datetime", Pattern::any()),
)
.element(relaxed_element("dfn"))
.element(relaxed_element("dl"))
.element(relaxed_element("dt"))
.element(relaxed_element("em"))
.element(relaxed_element("figcaption"))
.element(relaxed_element("figure"))
.element(relaxed_element("h1"))
.element(relaxed_element("h2"))
.element(relaxed_element("h3"))
.element(relaxed_element("h4"))
.element(relaxed_element("h5"))
.element(relaxed_element("h6"))
.element(relaxed_element("hgroup"))
.element(relaxed_element("i"))
.element(
relaxed_element("img")
.attribute("src", src())
.attribute("align", Pattern::any())
.attribute("alt", Pattern::any())
.attribute("width", Pattern::any())
.attribute("height", Pattern::any()),
)
.element(
relaxed_element("ins")
.attribute("cite", src())
.attribute("datetime", Pattern::any()),
)
.element(relaxed_element("kbd"))
.element(relaxed_element("li"))
.element(relaxed_element("mark"))
.element(
relaxed_element("ol")
.attribute("start", Pattern::any())
.attribute("reversed", Pattern::any())
.attribute("type", Pattern::any()),
)
.element(relaxed_element("p"))
.element(relaxed_element("pre"))
.element(relaxed_element("q").attribute("cite", src()))
.element(relaxed_element("rp"))
.element(relaxed_element("rt"))
.element(relaxed_element("ruby"))
.element(relaxed_element("s"))
.element(relaxed_element("samp"))
.element(relaxed_element("small"))
.element(relaxed_element("strike"))
.element(relaxed_element("strong"))
.element(relaxed_element("sub"))
.element(relaxed_element("sup"))
.element(
relaxed_element("table")
.attribute("summary", Pattern::any())
.attribute("width", Pattern::any()),
)
.element(relaxed_element("tbody"))
.element(
relaxed_element("td")
.attribute("abbr", Pattern::any())
.attribute("axis", Pattern::any())
.attribute("colspan", Pattern::any())
.attribute("rowspan", Pattern::any())
.attribute("width", Pattern::any()),
)
.element(relaxed_element("tfoot"))
.element(
relaxed_element("th")
.attribute("abbr", Pattern::any())
.attribute("axis", Pattern::any())
.attribute("colspan", Pattern::any())
.attribute("rowspan", Pattern::any())
.attribute("scope", Pattern::any())
.attribute("width", Pattern::any()),
)
.element(relaxed_element("thead"))
.element(
relaxed_element("time")
.attribute("datetime", Pattern::any())
.attribute("pubdate", Pattern::any()),
)
.element(relaxed_element("tr"))
.element(relaxed_element("u"))
.element(relaxed_element("ul").attribute("type", Pattern::any()))
.element(relaxed_element("var"))
.element(relaxed_element("wbr"))
.space("address")
.space("article")
.space("aside")
.space("footer")
.space("header")
.space("hr")
.space("nav")
.space("section")
}
/// Restricted rules. Allows only very simple inline markup. No links, images, or block elements.
pub fn restricted() -> Rules {
Rules::new()
.element(Element::new("b"))
.element(Element::new("em"))
.element(Element::new("i"))
.element(Element::new("strong"))
.element(Element::new("u"))
.space("address")
.space("article")
.space("aside")
.space("blockquote")
.space("br")
.space("dd")
.space("div")
.space("dl")
.space("dt")
.space("footer")
.space("h1")
.space("h2")
.space("h3")
.space("h4")
.space("h5")
.space("h6")
.space("header")
.space("hgroup")
.space("hr")
.space("li")
.space("nav")
.space("ol")
.space("p")
.space("pre")
.space("section")
.space("ul")
}
/// Rules for documents from untrusted sources. Removes all tags except text emphasis and links.
pub fn untrusted() -> Rules {
Rules::new()
.element(
Element::new("a")
.attribute("href", href())
.mandatory_attribute("target", "_blank")
.mandatory_attribute("rel", "noreferrer noopener"),
)
.element(Element::new("b"))
.element(Element::new("em"))
.element(Element::new("i"))
.element(Element::new("strong"))
.element(Element::new("u"))
.space("address")
.space("article")
.space("aside")
.space("blockquote")
.space("br")
.space("dd")
.space("div")
.space("dl")
.space("dt")
.space("footer")
.space("h1")
.space("h2")
.space("h3")
.space("h4")
.space("h5")
.space("h6")
.space("header")
.space("hgroup")
.space("hr")
.space("li")
.space("nav")
.space("ol")
.space("p")
.space("pre")
.space("section")
.space("ul")
}
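// A brief comparison sketch, assuming the predefined sets above and the
// crate-level `sanitize_str` entry point: RESTRICTED keeps simple inline
// emphasis, while DEFAULT strips every tag and keeps only the text.
#[cfg(test)]
mod predefined_usage_sketch {
    use super::{DEFAULT, RESTRICTED};
    use crate::sanitize_str;

    #[test]
    fn restricted_keeps_emphasis() {
        let input = "<em>Hi</em> <a href=\"https://example.org/\">there</a>";
        assert_eq!(
            &sanitize_str(&RESTRICTED, input).unwrap(),
            "<em>Hi</em> there"
        );
        assert_eq!(&sanitize_str(&DEFAULT, input).unwrap(), "Hi there");
    }
}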
#[cfg(test)]
mod tests {
use super::{basic, default, relaxed, restricted, untrusted};
#[test]
fn basic_does_not_fail() {
let rules = basic();
assert_eq!(rules.allowed_elements.len(), 31);
}
#[test]
fn default_does_not_fail() {
let rules = default();
assert_eq!(rules.allowed_elements.len(), 0);
assert_eq!(rules.space_elements.len(), 26);
assert_eq!(rules.delete_elements.len(), 6);
}
#[test]
fn relaxed_does_not_fail() {
let rules = relaxed();
assert_eq!(rules.allowed_elements.len(), 58);
assert_eq!(rules.space_elements.len(), 8);
}
#[test]
fn restricted_does_not_fail() {
let rules = restricted();
assert_eq!(rules.allowed_elements.len(), 5);
assert_eq!(rules.space_elements.len(), 26);
}
#[test]
fn untrusted_does_not_fail() {
let rules = untrusted();
assert_eq!(rules.allowed_elements.len(), 6);
assert_eq!(rules.space_elements.len(), 26);
}
}

View File

@ -1,202 +0,0 @@
use crate::rules::{Element, Rules};
use html5ever::{interface::QualName, namespace_url, ns, LocalName};
use kuchiki::{Attribute, ElementData, ExpandedName, NodeData, NodeRef};
fn simple_qual_name(name: &str) -> QualName {
QualName::new(None, ns!(), LocalName::from(name))
}
fn qual_name_to_string(name: &QualName) -> String {
if name.ns == ns!(html) || name.ns.is_empty() {
name.local.to_lowercase()
} else {
format!("{}:{}", name.ns.to_lowercase(), name.local.to_lowercase())
}
}
fn expanded_name_to_string(name: &ExpandedName) -> String {
if name.ns == ns!(html) || name.ns.is_empty() {
name.local.to_lowercase()
} else {
format!("{}:{}", name.ns.to_lowercase(), name.local.to_lowercase())
}
}
fn simple_element(
name: QualName,
attrs: Vec<(ExpandedName, Attribute)>,
children: Vec<NodeRef>,
) -> NodeRef {
let element = NodeRef::new_element(name, attrs);
for child in children {
child.detach();
element.append(child);
}
element
}
fn create_space_text() -> NodeRef {
NodeRef::new_text(" ")
}
enum ElementAction<'t> {
Keep(&'t Element),
Delete,
Space,
Elide,
Rename(&'t str),
}
fn element_action<'t>(element_name: &QualName, rules: &'t Rules) -> ElementAction<'t> {
let name = qual_name_to_string(element_name);
if name == "html" || name == "body" {
ElementAction::Elide
} else if let Some(element_sanitizer) = rules.allowed_elements.get(&name) {
ElementAction::Keep(element_sanitizer)
} else if rules.delete_elements.contains(&name) {
ElementAction::Delete
} else if rules.space_elements.contains(&name) {
ElementAction::Space
} else if let Some(rename_to) = rules.rename_elements.get(&name) {
ElementAction::Rename(rename_to)
} else {
ElementAction::Elide
}
}
fn clean_nodes(nodes: impl IntoIterator<Item = NodeRef>, rules: &Rules) -> Vec<NodeRef> {
let mut result = Vec::new();
for node in nodes {
let subnodes = clean_node(&node, rules);
result.extend(subnodes);
}
result
}
fn clean_node(node: &NodeRef, rules: &Rules) -> Vec<NodeRef> {
match node.data() {
NodeData::Document(..) => vec![],
NodeData::DocumentFragment => vec![], // TODO: ??
NodeData::Doctype(..) => vec![],
NodeData::ProcessingInstruction(..) => vec![],
NodeData::Text(..) => vec![node.clone()],
NodeData::Comment(..) => {
if rules.allow_comments {
vec![node.clone()]
} else {
vec![]
}
}
NodeData::Element(ElementData {
ref name,
ref attributes,
..
}) => {
match element_action(name, rules) {
ElementAction::Keep(element_sanitizer) => {
let mut new_attrs: Vec<(ExpandedName, Attribute)> = Vec::new();
/* whitelisted attributes */
for (attr_name, attr_value) in attributes.borrow().map.iter() {
let expanded_name = expanded_name_to_string(attr_name);
                    let new_value = match element_sanitizer
                        .attribute_rules
                        .modify_attributes
                        .get(&expanded_name)
                    {
                        None => attr_value.clone(),
                        Some(func) => Attribute {
                            prefix: attr_value.prefix.clone(),
                            value: func(attr_value.value.clone()),
                        },
                    };
if !element_sanitizer
.is_valid(&expanded_name_to_string(attr_name), &new_value.value)
{
continue;
}
let name = &attr_name.local.to_string();
let new_name = if element_sanitizer
.attribute_rules
.rename_attributes
.contains_key(name)
{
ExpandedName::new(
attr_name.ns.clone(),
String::from(
element_sanitizer
.attribute_rules
.rename_attributes
.get(name)
.unwrap(),
),
)
} else {
attr_name.clone()
};
                    // Keep the possibly renamed and modified attribute value.
                    new_attrs.push((new_name, new_value));
}
/* mandatory attributes */
let mut mandatory_attributes: Vec<(&String, &String)> =
element_sanitizer.mandatory_attributes.iter().collect();
mandatory_attributes.sort();
for &(attr_name, attr_value) in mandatory_attributes.iter() {
new_attrs.push((
ExpandedName::new(ns!(), LocalName::from(attr_name.as_str())),
Attribute {
prefix: None,
value: attr_value.into(),
},
));
}
let children = clean_nodes(node.children(), rules);
let element = simple_element(name.clone(), new_attrs, children);
vec![element]
}
ElementAction::Delete => vec![],
ElementAction::Elide => clean_nodes(node.children(), rules),
ElementAction::Space => {
let mut nodes = clean_nodes(node.children(), rules);
if nodes.is_empty() {
nodes.push(create_space_text());
} else {
nodes.insert(0, create_space_text());
nodes.push(create_space_text());
}
nodes
}
ElementAction::Rename(rename_to) => {
let children = clean_nodes(node.children(), rules);
vec![simple_element(
simple_qual_name(rename_to),
Vec::new(),
children,
)]
}
}
}
}
}
pub(crate) fn sanitize_dom(dom: &NodeRef, mode: &Rules) -> NodeRef {
let new_children = clean_nodes(dom.children(), mode);
let new_dom = NodeRef::new_document();
for child in new_children {
child.detach();
new_dom.append(child);
}
new_dom
}
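// A minimal sketch of how `element_action` drives `clean_node` above, assuming
// the predefined DEFAULT rules: <p> maps to Space, <script> to Delete, and an
// element the rules do not mention is elided (dropped but its children kept).
#[cfg(test)]
mod element_action_sketch {
    use super::{element_action, simple_qual_name, ElementAction};
    use crate::rules::predefined::DEFAULT;

    #[test]
    fn actions_follow_the_rules() {
        assert!(matches!(
            element_action(&simple_qual_name("p"), &DEFAULT),
            ElementAction::Space
        ));
        assert!(matches!(
            element_action(&simple_qual_name("script"), &DEFAULT),
            ElementAction::Delete
        ));
        assert!(matches!(
            element_action(&simple_qual_name("span"), &DEFAULT),
            ElementAction::Elide
        ));
    }
}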

View File

@ -1,645 +0,0 @@
#![cfg(test)]
use super::rules::predefined::*;
use super::rules::{Element, Rules};
use super::sanitize_str;
#[test]
fn empty() {
assert_eq!(&sanitize_str(&BASIC, "").unwrap(), "");
assert_eq!(&sanitize_str(&DEFAULT, "").unwrap(), "");
assert_eq!(&sanitize_str(&RELAXED, "").unwrap(), "");
assert_eq!(&sanitize_str(&RESTRICTED, "").unwrap(), "");
assert_eq!(&sanitize_str(&UNTRUSTED, "").unwrap(), "");
}
/* basic */
const BASIC_HTML: &str = "<b>Lo<!-- comment -->rem</b> <a href=\"pants\" title=\"foo\">ipsum</a> <a href=\"http://foo.com/\"><strong>dolor</strong></a> sit<br/>amet <script>alert(\"hello world\");</script>";
#[test]
fn basic_default() {
assert_eq!(
&sanitize_str(&DEFAULT, BASIC_HTML).unwrap(),
"Lorem ipsum dolor sit amet "
);
}
#[test]
fn basic_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, BASIC_HTML).unwrap(),
"<b>Lorem</b> ipsum <strong>dolor</strong> sit amet alert(\"hello world\");"
);
}
#[test]
fn basic_basic() {
assert_eq!(
&sanitize_str(&BASIC, BASIC_HTML).unwrap(),
"<b>Lorem</b> <a href=\"pants\">ipsum</a> <a href=\"http://foo.com/\"><strong>dolor</strong></a> sit<br>amet alert(\"hello world\");"
);
}
#[test]
fn basic_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, BASIC_HTML).unwrap(),
"<b>Lorem</b> <a href=\"pants\" title=\"foo\">ipsum</a> <a href=\"http://foo.com/\"><strong>dolor</strong></a> sit<br>amet alert(\"hello world\");"
);
}
/* malformed */
const MALFORMED_HTML: &str = "Lo<!-- comment -->rem</b> <a href=pants title=\"foo>ipsum <a href=\"http://foo.com/\"><strong>dolor</a></strong> sit<br/>amet <script>alert(\"hello world\");";
#[test]
fn malformed_default() {
assert_eq!(
&sanitize_str(&DEFAULT, MALFORMED_HTML).unwrap(),
"Lorem dolor sit amet "
);
}
#[test]
fn malformed_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, MALFORMED_HTML).unwrap(),
"Lorem <strong>dolor</strong> sit amet alert(\"hello world\");"
);
}
#[test]
fn malformed_basic() {
assert_eq!(
&sanitize_str(&BASIC, MALFORMED_HTML).unwrap(),
"Lorem <a href=\"pants\"><strong>dolor</strong></a> sit<br>amet alert(\"hello world\");"
);
}
#[test]
fn malformed_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, MALFORMED_HTML).unwrap(),
"Lorem <a href=\"pants\" title=\"foo>ipsum <a href=\"><strong>dolor</strong></a> sit<br>amet alert(\"hello world\");"
);
}
/* unclosed */
const UNCLOSED_HTML: &str = "<p>a</p><blockquote>b";
#[test]
fn unclosed_default() {
assert_eq!(&sanitize_str(&DEFAULT, UNCLOSED_HTML).unwrap(), " a b ");
}
#[test]
fn unclosed_restricted() {
assert_eq!(&sanitize_str(&RESTRICTED, UNCLOSED_HTML).unwrap(), " a b ");
}
#[test]
fn unclosed_basic() {
assert_eq!(
&sanitize_str(&BASIC, UNCLOSED_HTML).unwrap(),
"<p>a</p><blockquote>b</blockquote>"
);
}
#[test]
fn unclosed_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, UNCLOSED_HTML).unwrap(),
"<p>a</p><blockquote>b</blockquote>"
);
}
/* malicious */
const MALICIOUS_HTML: &str = "<b>Lo<!-- comment -->rem</b> <a href=\"javascript:pants\" title=\"foo\">ipsum</a> <a href=\"http://foo.com/\"><strong>dolor</strong></a> sit<br/>amet <<foo>script>alert(\"hello world\");</script>";
#[test]
fn malicious_default() {
assert_eq!(
&sanitize_str(&DEFAULT, MALICIOUS_HTML).unwrap(),
"Lorem ipsum dolor sit amet &lt;script&gt;alert(\"hello world\");"
);
}
#[test]
fn malicious_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, MALICIOUS_HTML).unwrap(),
"<b>Lorem</b> ipsum <strong>dolor</strong> sit amet &lt;script&gt;alert(\"hello world\");"
);
}
#[test]
fn malicious_basic() {
assert_eq!(
&sanitize_str(&BASIC, MALICIOUS_HTML).unwrap(),
"<b>Lorem</b> <a>ipsum</a> <a href=\"http://foo.com/\"><strong>dolor</strong></a> sit<br>amet &lt;script&gt;alert(\"hello world\");"
);
}
#[test]
fn malicious_untrusted() {
assert_eq!(
&sanitize_str(&UNTRUSTED, MALICIOUS_HTML).unwrap(),
"<b>Lorem</b> <a rel=\"noreferrer noopener\" target=\"_blank\">ipsum</a> <a href=\"http://foo.com/\" rel=\"noreferrer noopener\" target=\"_blank\"><strong>dolor</strong></a> sit amet &lt;script&gt;alert(\"hello world\");"
);
}
#[test]
fn malicious_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, MALICIOUS_HTML).unwrap(),
"<b>Lorem</b> <a title=\"foo\">ipsum</a> <a href=\"http://foo.com/\"><strong>dolor</strong></a> sit<br>amet &lt;script&gt;alert(\"hello world\");"
);
}
/* raw-comment */
const RAW_COMMENT_HTML: &str = "<!-- comment -->Hello";
#[test]
fn raw_comment_default() {
assert_eq!(&sanitize_str(&DEFAULT, RAW_COMMENT_HTML).unwrap(), "Hello");
}
#[test]
fn raw_comment_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, RAW_COMMENT_HTML).unwrap(),
"Hello"
);
}
#[test]
fn raw_comment_basic() {
assert_eq!(&sanitize_str(&BASIC, RAW_COMMENT_HTML).unwrap(), "Hello");
}
#[test]
fn raw_comment_relaxed() {
assert_eq!(&sanitize_str(&RELAXED, RAW_COMMENT_HTML).unwrap(), "Hello");
}
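// Editor's note: HTML comments are stripped by all of the predefined rule sets
// exercised above; they only survive when a rule set opts in via
// `allow_comments(true)`, as the `custom_rules` test at the end of this file shows.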
/* protocol-based JS injection: simple, no spaces */
const JS_INJECTION_HTML_1: &str = "<a href=\"javascript:alert(\'XSS\');\">foo</a>";
#[test]
fn js_injection_1_default() {
assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_1).unwrap(), "foo");
}
#[test]
fn js_injection_1_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, JS_INJECTION_HTML_1).unwrap(),
"foo"
);
}
#[test]
fn js_injection_1_basic() {
assert_eq!(
&sanitize_str(&BASIC, JS_INJECTION_HTML_1).unwrap(),
"<a>foo</a>"
);
}
#[test]
fn js_injection_1_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, JS_INJECTION_HTML_1).unwrap(),
"<a>foo</a>"
);
}
/* protocol-based JS injection: simple, spaces before */
const JS_INJECTION_HTML_2: &str = "<a href=\"javascript :alert(\'XSS\');\">foo</a>";
#[test]
fn js_injection_2_default() {
assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_2).unwrap(), "foo");
}
#[test]
fn js_injection_2_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, JS_INJECTION_HTML_2).unwrap(),
"foo"
);
}
#[test]
fn js_injection_2_basic() {
assert_eq!(
&sanitize_str(&BASIC, JS_INJECTION_HTML_2).unwrap(),
"<a>foo</a>"
);
}
#[test]
fn js_injection_2_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, JS_INJECTION_HTML_2).unwrap(),
"<a>foo</a>"
);
}
/* protocol-based JS injection: simple, spaces after */
const JS_INJECTION_HTML_3: &str = "<a href=\"javascript: alert(\'XSS\');\">foo</a>";
#[test]
fn js_injection_3_default() {
assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_3).unwrap(), "foo");
}
#[test]
fn js_injection_3_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, JS_INJECTION_HTML_3).unwrap(),
"foo"
);
}
#[test]
fn js_injection_3_basic() {
assert_eq!(
&sanitize_str(&BASIC, JS_INJECTION_HTML_3).unwrap(),
"<a>foo</a>"
);
}
#[test]
fn js_injection_3_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, JS_INJECTION_HTML_3).unwrap(),
"<a>foo</a>"
);
}
/* protocol-based JS injection: simple, spaces before and after */
const JS_INJECTION_HTML_4: &str = "<a href=\"javascript : alert(\'XSS\');\">foo</a>";
#[test]
fn js_injection_4_default() {
assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_4).unwrap(), "foo");
}
#[test]
fn js_injection_4_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, JS_INJECTION_HTML_4).unwrap(),
"foo"
);
}
#[test]
fn js_injection_4_basic() {
assert_eq!(
&sanitize_str(&BASIC, JS_INJECTION_HTML_4).unwrap(),
"<a>foo</a>"
);
}
#[test]
fn js_injection_4_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, JS_INJECTION_HTML_4).unwrap(),
"<a>foo</a>"
);
}
/* protocol-based JS injection: preceding colon */
const JS_INJECTION_HTML_5: &str = "<a href=\":javascript:alert(\'XSS\');\">foo</a>";
#[test]
fn js_injection_5_default() {
assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_5).unwrap(), "foo");
}
#[test]
fn js_injection_5_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, JS_INJECTION_HTML_5).unwrap(),
"foo"
);
}
#[test]
fn js_injection_5_basic() {
assert_eq!(
&sanitize_str(&BASIC, JS_INJECTION_HTML_5).unwrap(),
"<a>foo</a>"
);
}
#[test]
fn js_injection_5_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, JS_INJECTION_HTML_5).unwrap(),
"<a>foo</a>"
);
}
/* protocol-based JS injection: UTF-8 encoding */
const JS_INJECTION_HTML_6: &str = "<a href=\"javascript&#58;\">foo</a>";
#[test]
fn js_injection_6_default() {
assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_6).unwrap(), "foo");
}
#[test]
fn js_injection_6_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, JS_INJECTION_HTML_6).unwrap(),
"foo"
);
}
#[test]
fn js_injection_6_basic() {
assert_eq!(
&sanitize_str(&BASIC, JS_INJECTION_HTML_6).unwrap(),
"<a>foo</a>"
);
}
#[test]
fn js_injection_6_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, JS_INJECTION_HTML_6).unwrap(),
"<a>foo</a>"
);
}
/* protocol-based JS injection: long UTF-8 encoding */
const JS_INJECTION_HTML_7: &str = "<a href=\"javascript&#0058;\">foo</a>";
#[test]
fn js_injection_7_default() {
assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_7).unwrap(), "foo");
}
#[test]
fn js_injection_7_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, JS_INJECTION_HTML_7).unwrap(),
"foo"
);
}
#[test]
fn js_injection_7_basic() {
assert_eq!(
&sanitize_str(&BASIC, JS_INJECTION_HTML_7).unwrap(),
"<a>foo</a>"
);
}
#[test]
fn js_injection_7_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, JS_INJECTION_HTML_7).unwrap(),
"<a>foo</a>"
);
}
/* protocol-based JS injection: long UTF-8 encoding without semicolons */
const JS_INJECTION_HTML_8: &str = "<a href=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>foo</a>";
#[test]
fn js_injection_8_default() {
assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_8).unwrap(), "foo");
}
#[test]
fn js_injection_8_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, JS_INJECTION_HTML_8).unwrap(),
"foo"
);
}
#[test]
fn js_injection_8_basic() {
assert_eq!(
&sanitize_str(&BASIC, JS_INJECTION_HTML_8).unwrap(),
"<a>foo</a>"
);
}
#[test]
fn js_injection_8_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, JS_INJECTION_HTML_8).unwrap(),
"<a>foo</a>"
);
}
/* protocol-based JS injection: hex encoding */
const JS_INJECTION_HTML_9: &str = "<a href=\"javascript&#x3A;\">foo</a>";
#[test]
fn js_injection_9_default() {
assert_eq!(&sanitize_str(&DEFAULT, JS_INJECTION_HTML_9).unwrap(), "foo");
}
#[test]
fn js_injection_9_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, JS_INJECTION_HTML_9).unwrap(),
"foo"
);
}
#[test]
fn js_injection_9_basic() {
assert_eq!(
&sanitize_str(&BASIC, JS_INJECTION_HTML_9).unwrap(),
"<a>foo</a>"
);
}
#[test]
fn js_injection_9_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, JS_INJECTION_HTML_9).unwrap(),
"<a>foo</a>"
);
}
/* protocol-based JS injection: long hex encoding */
const JS_INJECTION_HTML_10: &str = "<a href=\"javascript&#x003A;\">foo</a>";
#[test]
fn js_injection_10_default() {
assert_eq!(
&sanitize_str(&DEFAULT, JS_INJECTION_HTML_10).unwrap(),
"foo"
);
}
#[test]
fn js_injection_10_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, JS_INJECTION_HTML_10).unwrap(),
"foo"
);
}
#[test]
fn js_injection_10_basic() {
assert_eq!(
&sanitize_str(&BASIC, JS_INJECTION_HTML_10).unwrap(),
"<a>foo</a>"
);
}
#[test]
fn js_injection_10_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, JS_INJECTION_HTML_10).unwrap(),
"<a>foo</a>"
);
}
/* protocol-based JS injection: hex encoding without semicolons */
const JS_INJECTION_HTML_11: &str = "<a href=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>foo</a>";
#[test]
fn js_injection_11_default() {
assert_eq!(
&sanitize_str(&DEFAULT, JS_INJECTION_HTML_11).unwrap(),
"foo"
);
}
#[test]
fn js_injection_11_restricted() {
assert_eq!(
&sanitize_str(&RESTRICTED, JS_INJECTION_HTML_11).unwrap(),
"foo"
);
}
#[test]
fn js_injection_11_basic() {
assert_eq!(
&sanitize_str(&BASIC, JS_INJECTION_HTML_11).unwrap(),
"<a>foo</a>"
);
}
#[test]
fn js_injection_11_relaxed() {
assert_eq!(
&sanitize_str(&RELAXED, JS_INJECTION_HTML_11).unwrap(),
"<a>foo</a>"
);
}
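// Editor's sketch (assumption): the encoded variants above all collapse to the
// same check because html5ever decodes character references while parsing, so the
// sanitizer only sees a plain attribute value and has to normalize whitespace and
// allow-list the scheme. A helper in that spirit (illustrative only, not the
// removed crate's actual protocol filter):
#[allow(dead_code)]
fn href_is_allowed(href: &str, allowed: &[&str]) -> bool {
    // Strip ASCII whitespace (covers "javascript :", "javascript: ", and so on);
    // numeric references such as &#x3A; were already decoded by the parser.
    let cleaned: String = href.chars().filter(|c| !c.is_ascii_whitespace()).collect();
    match cleaned.split_once(':') {
        // Absolute URL: keep it only if the scheme is on the allow-list.
        // (A leading colon, as in ":javascript:...", yields an empty scheme
        // and is rejected as well.)
        Some((scheme, _)) => allowed.iter().any(|p| scheme.eq_ignore_ascii_case(p)),
        // No scheme at all: treat it as a relative URL, which is allowed.
        None => true,
    }
}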
/* should translate valid HTML entities */
#[test]
fn misc_1() {
assert_eq!(
&sanitize_str(&DEFAULT, "Don&apos;t tas&eacute; me &amp; bro!").unwrap(),
"Don't tasé me &amp; bro!"
);
}
/* should translate valid HTML entities while encoding unencoded ampersands */
#[test]
fn misc_2() {
assert_eq!(
&sanitize_str(&DEFAULT, "cookies&sup2; & &frac14; cr&eacute;me").unwrap(),
"cookies² &amp; ¼ créme"
);
}
/* should never output &apos; */
#[test]
fn misc_3() {
assert_eq!(
&sanitize_str(
&DEFAULT,
"<a href='&apos;' class=\"' &#39;\">IE6 isn't a real browser</a>"
)
.unwrap(),
"IE6 isn't a real browser"
);
}
/* should not choke on several instances of the same element in a row */
#[test]
fn misc_4() {
assert_eq!(
&sanitize_str(&DEFAULT, "<img src=\"http://www.google.com/intl/en_ALL/images/logo.gif\"><img src=\"http://www.google.com/intl/en_ALL/images/logo.gif\"><img src=\"http://www.google.com/intl/en_ALL/images/logo.gif\"><img src=\"http://www.google.com/intl/en_ALL/images/logo.gif\">").unwrap(),
""
);
}
/* should surround the contents of :whitespace_elements with space characters when removing the element */
#[test]
fn misc_5() {
assert_eq!(
&sanitize_str(&DEFAULT, "foo<div>bar</div>baz").unwrap(),
"foo bar baz"
);
}
#[test]
fn misc_6() {
assert_eq!(
&sanitize_str(&DEFAULT, "foo<br>bar<br>baz").unwrap(),
"foo bar baz"
);
}
#[test]
fn misc_7() {
assert_eq!(
&sanitize_str(&DEFAULT, "foo<hr>bar<hr>baz").unwrap(),
"foo bar baz"
);
}
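// Editor's sketch (hypothetical test, not in the original suite): the three cases
// above exercise the whitespace handling implemented by `ElementAction::Space` in
// the sanitizer module. Assuming unknown elements are elided by default, the same
// behaviour should fall out of the builder's `space` rule used in `custom_rules`:
#[test]
fn misc_whitespace_space_rule_sketch() {
    let rules = Rules::new().space("div");
    assert_eq!(
        &sanitize_str(&rules, "foo<div>bar</div>baz").unwrap(),
        "foo bar baz"
    );
}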
#[test]
fn custom_rules() {
let rules = Rules::new()
.allow_comments(true)
.element(Element::new("b"))
.element(Element::new("span"))
.delete("script")
.delete("style")
.space("br")
.rename("strong", "span");
let html = "<b>Lo<!-- comment -->rem</b> <a href=\"javascript:pants\" title=\"foo\">ipsum</a> <a href=\"http://foo.com/\"><strong>dolor</strong></a> sit<br/>amet <script>alert(\"hello world\")</script>";
assert_eq!(
&sanitize_str(&rules, html).unwrap(),
"<b>Lo<!-- comment -->rem</b> ipsum <span>dolor</span> sit amet "
);
}
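// Editor's note: the builder calls in `custom_rules` map directly onto the
// `ElementAction` variants handled in the sanitizer module above: `element(..)`
// keeps the tag, `delete(..)` removes the tag and its children, `space(..)`
// replaces the tag with space-padded children, and `rename(from, to)` keeps the
// cleaned children under a new tag with no attributes.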