Add fork of kuchiki

Signed-off-by: Jacob Kiers <jacob@jacobkiers.net>
This commit is contained in:
Jacob Kiers 2022-06-10 14:37:41 +02:00
parent 4e3f7b46da
commit ecb435bbc4
22 changed files with 2407 additions and 0 deletions

View File

@ -3,4 +3,5 @@
members = [
"bin",
"sanitize-html-rs",
"kuchiki",
]

3
kuchiki/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
target
Cargo.lock
.cargo/config

6
kuchiki/.travis.yml Normal file
View File

@ -0,0 +1,6 @@
sudo: false
language: rust
rust:
- nightly
- beta
- stable

22
kuchiki/Cargo.toml Normal file
View File

@ -0,0 +1,22 @@
[package]
name = "kuchiki"
version = "0.8.1"
authors = ["Simon Sapin <simon.sapin@exyr.org>"]
license = "MIT"
description = "(朽木) HTML/XML tree manipulation library"
repository = "https://github.com/kuchiki-rs/kuchiki"
edition = "2018"
[lib]
name = "kuchiki"
doctest = false
[dependencies]
cssparser = "0.27"
matches = "0.1.4"
html5ever = "0.25"
selectors = "0.22"
indexmap = "1.6.0"
[dev-dependencies]
tempfile = "3"

23
kuchiki/LICENSE Normal file
View File

@ -0,0 +1,23 @@
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

10
kuchiki/README.md Normal file
View File

@ -0,0 +1,10 @@
Kuchiki (朽木)
==============
HTML/XML¹ tree manipulation library for Rust.
[Documentation](https://docs.rs/kuchiki/)
See [users.rust-lang.org discussion](http://users.rust-lang.org/t/kuchiki-a-vaporware-html-xml-tree-manipulation-library/435).
¹ There is no support for XML syntax yet. The plan is to integrate with an existing parser.

0
kuchiki/docs/.nojekyll Normal file
View File

3
kuchiki/docs/404.html Normal file
View File

@ -0,0 +1,3 @@
<meta http-equiv="refresh" content="0; url=https://docs.rs/kuchiki/">
<link rel="canonical" href="https://docs.rs/kuchiki/">
<a href="https://docs.rs/kuchiki/">Moved to docs.rs</a>

3
kuchiki/docs/index.html Normal file
View File

@ -0,0 +1,3 @@
<meta http-equiv="refresh" content="0; url=https://docs.rs/kuchiki/">
<link rel="canonical" href="https://docs.rs/kuchiki/">
<a href="https://docs.rs/kuchiki/">Moved to docs.rs</a>

View File

@ -0,0 +1,48 @@
extern crate kuchiki;
use kuchiki::traits::*;
/// Parse a small HTML document and print the text of every element matching
/// the `.foo` class selector.
fn main() {
    // The sample document. It must start with a real doctype declaration
    // (`<!DOCTYPE html>`); without the `!` the parser sees an unknown
    // `<doctype>` start tag instead.
    let html = r"
<!DOCTYPE html>
<html>
<head></head>
<body>
<h1>Example</h1>
<p class='foo'>Hello, world!</p>
<p class='foo'>I love HTML</p>
</body>
</html>
";
    let css_selector = ".foo";

    let document = kuchiki::parse_html().one(html);

    for css_match in document.select(css_selector).unwrap() {
        // css_match is a NodeDataRef, but most of the interesting methods are
        // on NodeRef. Let's get the underlying NodeRef.
        let as_node = css_match.as_node();

        // In this example, as_node represents an HTML node like
        //
        //   <p class='foo'>Hello, world!</p>
        //
        // Which is distinct from just 'Hello, world!'. To get rid of that <p>
        // tag, we're going to get each element's first child, which will be
        // a "text" node.
        //
        // There are other kinds of nodes, of course. The possibilities are all
        // listed in the `NodeData` enum in this crate.
        let text_node = as_node.first_child().unwrap();

        // Let's get the actual text in this text node. A text node wraps around
        // a RefCell<String>, so we need to call borrow() to get a &str out.
        let text = text_node.as_text().unwrap().borrow();

        // Prints:
        //
        //  "Hello, world!"
        //  "I love HTML"
        println!("{:?}", text);
    }
}

View File

@ -0,0 +1,22 @@
extern crate kuchiki;
/// Build ever deeper chains of nested nodes (2, 20, ... 20 000 000 levels)
/// and drop each one, to show that dropping a deep tree does not overflow
/// the stack.
fn main() {
    let mut depth = 2;
    // 20 M nodes is a few GB of memory.
    loop {
        if depth > 20_000_000 {
            break;
        }

        // Wrap an innermost text node in `depth` successive parents.
        let chain = (0..depth).fold(kuchiki::NodeRef::new_text(""), |inner, _| {
            let outer = kuchiki::NodeRef::new_text("");
            outer.append(inner);
            outer
        });

        println!("Trying to drop {} nodes...", depth);
        // Without an explicit `impl Drop for Node`,
        // depth = 20_000 causes "thread '<main>' has overflowed its stack"
        // on my machine (Linux x86_64).
        drop(chain);

        depth *= 10;
    }
}

83
kuchiki/src/attributes.rs Normal file
View File

@ -0,0 +1,83 @@
use html5ever::{LocalName, Namespace, Prefix};
use indexmap::{map::Entry, IndexMap};
/// Convenience wrapper around an `IndexMap` that adds methods for attributes in the null
/// namespace.
#[derive(Debug, PartialEq, Clone)]
pub struct Attributes {
    /// A map of attributes whose names can have namespaces.
    pub map: IndexMap<ExpandedName, Attribute>,
}
/// A namespace plus local name pair, used as the key identifying an attribute.
///
/// <https://www.w3.org/TR/REC-xml-names/#dt-expname>
#[derive(Debug, PartialEq, Eq, Hash, Clone, PartialOrd, Ord)]
pub struct ExpandedName {
    /// Namespace URL
    pub ns: Namespace,
    /// "Local" part of the name
    pub local: LocalName,
}

impl ExpandedName {
    /// Trivial constructor: converts both arguments with `Into` and stores them.
    pub fn new<N: Into<Namespace>, L: Into<LocalName>>(ns: N, local: L) -> Self {
        ExpandedName {
            ns: ns.into(),
            local: local.into(),
        }
    }
}
/// The non-identifying parts of an attribute, i.e. everything that is not part of its
/// `ExpandedName` key.
#[derive(Debug, PartialEq, Clone)]
pub struct Attribute {
    /// The namespace prefix, if any
    pub prefix: Option<Prefix>,
    /// The attribute value
    pub value: String,
}
impl Attributes {
    /// Like `IndexMap::contains_key`, for an attribute name in the null namespace.
    pub fn contains<A: Into<LocalName>>(&self, local_name: A) -> bool {
        let key = ExpandedName::new(ns!(), local_name);
        self.map.contains_key(&key)
    }

    /// Like `IndexMap::get`, for an attribute name in the null namespace.
    ///
    /// Returns the attribute value as a string slice.
    pub fn get<A: Into<LocalName>>(&self, local_name: A) -> Option<&str> {
        let key = ExpandedName::new(ns!(), local_name);
        let attribute = self.map.get(&key)?;
        Some(attribute.value.as_str())
    }

    /// Like `IndexMap::get_mut`, for an attribute name in the null namespace.
    ///
    /// Returns a mutable reference to the attribute value.
    pub fn get_mut<A: Into<LocalName>>(&mut self, local_name: A) -> Option<&mut String> {
        let key = ExpandedName::new(ns!(), local_name);
        let attribute = self.map.get_mut(&key)?;
        Some(&mut attribute.value)
    }

    /// Like `IndexMap::entry`, for an attribute name in the null namespace.
    pub fn entry<A: Into<LocalName>>(&mut self, local_name: A) -> Entry<ExpandedName, Attribute> {
        let key = ExpandedName::new(ns!(), local_name);
        self.map.entry(key)
    }

    /// Like `IndexMap::insert`, for an attribute name in the null namespace.
    ///
    /// The stored attribute has no prefix. Returns the attribute previously stored
    /// under that name, if any.
    pub fn insert<A: Into<LocalName>>(
        &mut self,
        local_name: A,
        value: String,
    ) -> Option<Attribute> {
        let key = ExpandedName::new(ns!(), local_name);
        let attribute = Attribute {
            prefix: None,
            value,
        };
        self.map.insert(key, attribute)
    }

    /// Like `IndexMap::remove`, for an attribute name in the null namespace.
    ///
    /// NOTE(review): indexmap 1.x documents `remove` as equivalent to `swap_remove`,
    /// which can change the relative order of the remaining attributes. Confirm
    /// whether order-preserving removal (`shift_remove`) is wanted here.
    pub fn remove<A: Into<LocalName>>(&mut self, local_name: A) -> Option<Attribute> {
        let key = ExpandedName::new(ns!(), local_name);
        self.map.remove(&key)
    }
}

113
kuchiki/src/cell_extras.rs Normal file
View File

@ -0,0 +1,113 @@
//! Specialized methods for `Cell` of some specific `!Copy` types,
//! allowing limited access to a value without moving it out of the cell.
//!
//!
//! # Soundness
//!
//! These methods use `Cell::as_ptr` and `unsafe`.
//! Their soundness lies in that:
//!
//! * `Cell<T>: !Sync` for any `T`, so no other thread is accessing this cell.
//! * For the duration of the raw pointer access,
//! this thread only runs code that is known to not access the same cell again.
//! In particular, no method of a type parameter is called.
//! For example, `clone_inner` would be unsound to generalize to any `Cell<T>`
//! because it would involve running arbitrary code through `T::clone`
//! and provide that code with a reference to the inside of the cell.
//!
//! ```rust
//! struct Evil(Box<u32>, Rc<Cell<Option<Evil>>>);
//! impl Clone for Evil {
//! fn clone(&self) -> Self {
//! mem::drop(self.1.take()); // Mess with the "other" node, which might be `self`.
//! Evil(
//! self.0.clone(), // possible use after free!
//! Rc::new(Cell::new(None))
//! )
//! }
//! }
//! let a = Rc::new(Cell::new(None));
//! a.set(Some(Evil(Box::new(5), a.clone()))); // Make a reference cycle.
//! a.clone_inner();
//! ```
//!
//! `Rc<T>::clone` and `Weak<T>::clone` do not have this problem
//! as they only increment reference counts and never call `T::clone`.
//!
//!
//! # Alternative
//!
//! To avoid using `unsafe` entirely, operating on a `T: !Copy` value inside a `Cell<T>`
//! would require temporarily replacing it with a default value:
//!
//! ```rust
//! fn option_dance<T, F, R>(cell: &Cell<T>, f: F) -> R
//! where T: Default, F: FnOnce(&mut T) -> R
//! {
//! let mut value = cell.take();
//! let result = f(&mut value);
//! cell.set(value);
//! result
//! }
//! ```
//!
//! It would be worth exploring whether LLVM can reliably optimize away these extra moves
//! and compile the `Option` dance to assembly similar to that of the `unsafe` operation.
use std::cell::Cell;
use std::rc::{Rc, Weak};
/// Inspection of a `Cell<Option<T>>` without moving the value out of the cell.
pub trait CellOption {
    /// Return `true` when the cell currently holds `None`.
    fn is_none(&self) -> bool;
}

impl<T> CellOption for Cell<Option<T>> {
    #[inline]
    fn is_none(&self) -> bool {
        // SAFETY: `Cell` is `!Sync`, and the shared borrow below only lives for the
        // `Option::is_none` call, which runs no code belonging to `T`.
        let slot: &Option<T> = unsafe { &*self.as_ptr() };
        slot.is_none()
    }
}
/// Access to a `Cell<Option<Weak<T>>>` without moving the weak reference out.
pub trait CellOptionWeak<T> {
    /// Try to upgrade the stored weak reference; `None` if the cell is empty
    /// or the upgrade fails.
    fn upgrade(&self) -> Option<Rc<T>>;
    /// Return a clone of the stored weak reference, if any.
    fn clone_inner(&self) -> Option<Weak<T>>;
}

impl<T> CellOptionWeak<T> for Cell<Option<Weak<T>>> {
    #[inline]
    fn upgrade(&self) -> Option<Rc<T>> {
        // SAFETY: `Cell` is `!Sync`, and `Weak::upgrade` only touches reference
        // counts; it never calls into `T`, so nothing can re-enter this cell.
        let slot: &Option<Weak<T>> = unsafe { &*self.as_ptr() };
        match slot {
            Some(weak) => weak.upgrade(),
            None => None,
        }
    }

    #[inline]
    fn clone_inner(&self) -> Option<Weak<T>> {
        // SAFETY: as above; cloning a `Weak<T>` only increments the weak count.
        let slot: &Option<Weak<T>> = unsafe { &*self.as_ptr() };
        slot.as_ref().cloned()
    }
}
/// Access to a `Cell<Option<Rc<T>>>` without moving the `Rc` out unconditionally.
pub trait CellOptionRc<T> {
    /// Take the `Rc` out of the cell (leaving `None`) and return it, but only if it
    /// is the sole strong reference; weak references do not count.
    /// Otherwise leave the cell untouched and return `None`.
    fn take_if_unique_strong(&self) -> Option<Rc<T>>;
    /// Return a clone of the stored `Rc`, if any.
    fn clone_inner(&self) -> Option<Rc<T>>;
}

impl<T> CellOptionRc<T> for Cell<Option<Rc<T>>> {
    #[inline]
    fn take_if_unique_strong(&self) -> Option<Rc<T>> {
        // SAFETY: `Cell` is `!Sync`, and reading the strong count runs no code of
        // `T`. The shared borrow ends before `self.take()` below mutates the cell.
        let sole_owner = unsafe {
            (*self.as_ptr())
                .as_ref()
                .map_or(false, |rc| Rc::strong_count(rc) <= 1)
        };
        if sole_owner {
            self.take()
        } else {
            None
        }
    }

    #[inline]
    fn clone_inner(&self) -> Option<Rc<T>> {
        // SAFETY: as above; cloning an `Rc<T>` only increments the strong count.
        let slot: &Option<Rc<T>> = unsafe { &*self.as_ptr() };
        slot.as_ref().cloned()
    }
}

452
kuchiki/src/iter.rs Normal file
View File

@ -0,0 +1,452 @@
//! Node iterators
use std::borrow::Borrow;
use std::cell::RefCell;
use std::iter::Rev;
use crate::node_data_ref::NodeDataRef;
use crate::select::Selectors;
use crate::tree::{ElementData, NodeRef};
impl NodeRef {
    /// Return an iterator of references to this node and its ancestors.
    #[inline]
    pub fn inclusive_ancestors(&self) -> Ancestors {
        let start = self.clone();
        Ancestors(Some(start))
    }

    /// Return an iterator of references to this node's ancestors.
    #[inline]
    pub fn ancestors(&self) -> Ancestors {
        let start = self.parent();
        Ancestors(start)
    }

    /// Return an iterator of references to this node and the siblings before it.
    #[inline]
    pub fn inclusive_preceding_siblings(&self) -> Rev<Siblings> {
        // The range runs from the parent's first child (or this node itself when it
        // has no parent) up to this node, and is then walked backwards.
        let first = if let Some(parent) = self.parent() {
            let first_sibling = parent.first_child().unwrap();
            debug_assert!(self.previous_sibling().is_some() || *self == first_sibling);
            first_sibling
        } else {
            debug_assert!(self.previous_sibling().is_none());
            self.clone()
        };
        let range = Siblings(Some(State {
            next: first,
            next_back: self.clone(),
        }));
        range.rev()
    }

    /// Return an iterator of references to this node's siblings before it.
    #[inline]
    pub fn preceding_siblings(&self) -> Rev<Siblings> {
        let state = match (self.parent(), self.previous_sibling()) {
            (Some(parent), Some(previous)) => Some(State {
                next: parent.first_child().unwrap(),
                next_back: previous,
            }),
            _ => None,
        };
        Siblings(state).rev()
    }

    /// Return an iterator of references to this node and the siblings after it.
    #[inline]
    pub fn inclusive_following_siblings(&self) -> Siblings {
        // The range runs from this node to the parent's last child (or this node
        // itself when it has no parent).
        let last = if let Some(parent) = self.parent() {
            let last_sibling = parent.last_child().unwrap();
            debug_assert!(self.next_sibling().is_some() || *self == last_sibling);
            last_sibling
        } else {
            debug_assert!(self.next_sibling().is_none());
            self.clone()
        };
        Siblings(Some(State {
            next: self.clone(),
            next_back: last,
        }))
    }

    /// Return an iterator of references to this node's siblings after it.
    #[inline]
    pub fn following_siblings(&self) -> Siblings {
        let state = match (self.parent(), self.next_sibling()) {
            (Some(parent), Some(following)) => Some(State {
                next: following,
                next_back: parent.last_child().unwrap(),
            }),
            _ => None,
        };
        Siblings(state)
    }

    /// Return an iterator of references to this node's children.
    #[inline]
    pub fn children(&self) -> Siblings {
        let state = match (self.first_child(), self.last_child()) {
            (Some(first), Some(last)) => Some(State {
                next: first,
                next_back: last,
            }),
            (None, None) => None,
            // A node has either both a first and a last child, or neither.
            _ => unreachable!(),
        };
        Siblings(state)
    }

    /// Return an iterator of references to this node and its descendants, in tree order.
    ///
    /// Parent nodes appear before the descendants.
    ///
    /// Note: this is the `NodeEdge::Start` items from `traverse_inclusive()`.
    #[inline]
    pub fn inclusive_descendants(&self) -> Descendants {
        let edges = self.traverse_inclusive();
        Descendants(edges)
    }

    /// Return an iterator of references to this node's descendants, in tree order.
    ///
    /// Parent nodes appear before the descendants.
    ///
    /// Note: this is the `NodeEdge::Start` items from `traverse()`.
    #[inline]
    pub fn descendants(&self) -> Descendants {
        let edges = self.traverse();
        Descendants(edges)
    }

    /// Return an iterator of the start and end edges of this node and its descendants,
    /// in tree order.
    #[inline]
    pub fn traverse_inclusive(&self) -> Traverse {
        let state = State {
            next: NodeEdge::Start(self.clone()),
            next_back: NodeEdge::End(self.clone()),
        };
        Traverse(Some(state))
    }

    /// Return an iterator of the start and end edges of this node's descendants,
    /// in tree order.
    #[inline]
    pub fn traverse(&self) -> Traverse {
        let state = match (self.first_child(), self.last_child()) {
            (Some(first), Some(last)) => Some(State {
                next: NodeEdge::Start(first),
                next_back: NodeEdge::End(last),
            }),
            (None, None) => None,
            // A node has either both a first and a last child, or neither.
            _ => unreachable!(),
        };
        Traverse(state)
    }

    /// Return an iterator of the inclusive descendant elements that match the given
    /// selector list.
    ///
    /// Returns `Err(())` when the selector list cannot be compiled.
    #[inline]
    pub fn select(&self, selectors: &str) -> Result<Select<Elements<Descendants>>, ()> {
        let candidates = self.inclusive_descendants();
        candidates.select(selectors)
    }

    /// Return the first inclusive descendant element that matches the given selector list.
    ///
    /// Returns `Err(())` when the selectors cannot be compiled or nothing matches.
    #[inline]
    pub fn select_first(&self, selectors: &str) -> Result<NodeDataRef<ElementData>, ()> {
        match self.select(selectors)?.next() {
            Some(element) => Ok(element),
            None => Err(()),
        }
    }
}
/// The two cursors of a double-ended iteration.
///
/// The iterators in this module wrap it in an `Option`; `None` means the
/// iteration is exhausted.
#[derive(Debug, Clone)]
struct State<T> {
    /// The item that the next call to `next` yields.
    next: T,
    /// The item that the next call to `next_back` yields.
    next_back: T,
}
/// A double-ended iterator of sibling nodes.
#[derive(Debug, Clone)]
pub struct Siblings(Option<State<NodeRef>>);

// Generates one direction of the iteration. The same body serves `next` and
// `next_back`: the two `State` field names are passed swapped for the backward
// direction, together with the sibling accessor to step with.
macro_rules! siblings_next {
    ($next: ident, $next_back: ident, $next_sibling: ident) => {
        fn $next(&mut self) -> Option<NodeRef> {
            #![allow(non_shorthand_field_patterns)]
            // `take()` leaves the iterator exhausted unless a new state is stored below.
            self.0.take().map(|State { $next: next, $next_back: next_back }| {
                // The neighbour is looked up before `next` is handed to the caller.
                if let Some(sibling) = next.$next_sibling() {
                    // Only continue when the two cursors have not met yet.
                    if next != next_back {
                        self.0 = Some(State { $next: sibling, $next_back: next_back })
                    }
                }
                next
            })
        }
    }
}

impl Iterator for Siblings {
    type Item = NodeRef;
    siblings_next!(next, next_back, next_sibling);
}

impl DoubleEndedIterator for Siblings {
    siblings_next!(next_back, next, previous_sibling);
}
/// An iterator over a chain of ancestor nodes, following parent links upwards.
#[derive(Debug, Clone)]
pub struct Ancestors(Option<NodeRef>);

impl Iterator for Ancestors {
    type Item = NodeRef;

    #[inline]
    fn next(&mut self) -> Option<NodeRef> {
        // Yield the stored node and queue its parent for the following call.
        let current = self.0.take()?;
        self.0 = current.parent();
        Some(current)
    }
}
/// An iterator of references to a given node and its descendants, in tree order.
#[derive(Debug, Clone)]
pub struct Descendants(Traverse);

// Generates `next` / `next_back`: pull edges from the wrapped `Traverse` in the
// given direction and yield the node of the first `Start` edge found.
macro_rules! descendants_next {
    ($next: ident) => {
        #[inline]
        fn $next(&mut self) -> Option<NodeRef> {
            while let Some(edge) = (self.0).$next() {
                // `End` edges are skipped so that every node is yielded once.
                if let NodeEdge::Start(node) = edge {
                    return Some(node);
                }
            }
            None
        }
    }
}

impl Iterator for Descendants {
    type Item = NodeRef;
    descendants_next!(next);
}

impl DoubleEndedIterator for Descendants {
    descendants_next!(next_back);
}
/// Marks either the start or the end of a node.
///
/// `Traverse` emits both edges for every node it visits; for a node without
/// children the `End` edge directly follows the `Start` edge.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum NodeEdge<T> {
    /// Indicates the start of a node.
    /// Yielded by `Traverse::next` before the node's descendants.
    /// In HTML or XML, this corresponds to an opening tag like `<div>`
    Start(T),
    /// Indicates the end of a node.
    /// Yielded by `Traverse::next` after the node's descendants.
    /// In HTML or XML, this corresponds to a closing tag like `</div>`
    End(T),
}
/// An iterator of the start and end edges of the nodes in a given subtree.
#[derive(Debug, Clone)]
pub struct Traverse(Option<State<NodeEdge<NodeRef>>>);

// Generates one direction of the traversal. For the backward direction the
// `State` fields, the child/sibling accessors and the `Start`/`End` variants are
// all passed swapped, so the same body walks the tree in reverse.
macro_rules! traverse_next {
    ($next: ident, $next_back: ident, $first_child: ident, $next_sibling: ident, $Start: ident, $End: ident) => {
        fn $next(&mut self) -> Option<NodeEdge<NodeRef>> {
            #![allow(non_shorthand_field_patterns)]
            // `take()` leaves the iterator exhausted unless a new state is stored below.
            self.0.take().map(|State { $next: next, $next_back: next_back }| {
                // When both cursors are on the same edge, this is the last item.
                if next != next_back {
                    self.0 = match next {
                        // Entering a node: descend into its first child, or close
                        // the node right away when it has no children.
                        NodeEdge::$Start(ref node) => {
                            match node.$first_child() {
                                Some(child) => {
                                    Some(State { $next: NodeEdge::$Start(child), $next_back: next_back })
                                }
                                None => Some(State { $next: NodeEdge::$End(node.clone()), $next_back: next_back })
                            }
                        }
                        // Leaving a node: move on to its sibling, or close the
                        // parent when there is no sibling left.
                        NodeEdge::$End(ref node) => {
                            match node.$next_sibling() {
                                Some(sibling) => {
                                    Some(State { $next: NodeEdge::$Start(sibling), $next_back: next_back })
                                }
                                None => node.parent().map(|parent| {
                                    State { $next: NodeEdge::$End(parent), $next_back: next_back }
                                })
                            }
                        }
                    };
                }
                next
            })
        }
    }
}

impl Iterator for Traverse {
    type Item = NodeEdge<NodeRef>;
    traverse_next!(next, next_back, first_child, next_sibling, Start, End);
}

impl DoubleEndedIterator for Traverse {
    traverse_next!(next_back, next, last_child, previous_sibling, End, Start);
}
// Declares a public tuple-struct adaptor `$name<I>` that wraps an iterator of
// `$from` items and yields only the `$to` values for which `$f` returns `Some`,
// in either direction.
macro_rules! filter_map_like_iterator {
    (#[$doc: meta] $name: ident: $f: expr, $from: ty => $to: ty) => {
        #[$doc]
        #[derive(Debug, Clone)]
        pub struct $name<I>(pub I);

        impl<I> Iterator for $name<I>
        where
            I: Iterator<Item = $from>,
        {
            type Item = $to;

            #[inline]
            fn next(&mut self) -> Option<$to> {
                // First item from the front that converts successfully.
                self.0.by_ref().find_map($f)
            }
        }

        impl<I> DoubleEndedIterator for $name<I>
        where
            I: DoubleEndedIterator<Item = $from>,
        {
            #[inline]
            fn next_back(&mut self) -> Option<$to> {
                // First item from the back that converts successfully.
                self.0.by_ref().rev().find_map($f)
            }
        }
    };
}

filter_map_like_iterator! {
    /// A node iterator adaptor that yields element nodes.
    Elements: NodeRef::into_element_ref, NodeRef => NodeDataRef<ElementData>
}

filter_map_like_iterator! {
    /// A node iterator adaptor that yields comment nodes.
    Comments: NodeRef::into_comment_ref, NodeRef => NodeDataRef<RefCell<String>>
}

filter_map_like_iterator! {
    /// A node iterator adaptor that yields text nodes.
    TextNodes: NodeRef::into_text_ref, NodeRef => NodeDataRef<RefCell<String>>
}
/// An element iterator adaptor that yields elements matching given selectors.
pub struct Select<I, S = Selectors>
where
    I: Iterator<Item = NodeDataRef<ElementData>>,
    S: Borrow<Selectors>,
{
    /// The underlying iterator.
    pub iter: I,

    /// The selectors to be matched.
    pub selectors: S,
}

impl<I, S> Iterator for Select<I, S>
where
    I: Iterator<Item = NodeDataRef<ElementData>>,
    S: Borrow<Selectors>,
{
    type Item = NodeDataRef<ElementData>;

    #[inline]
    fn next(&mut self) -> Option<NodeDataRef<ElementData>> {
        // Borrow the selectors once, then search forward for the first match.
        let selectors: &Selectors = self.selectors.borrow();
        self.iter.by_ref().find(|element| selectors.matches(element))
    }
}

impl<I, S> DoubleEndedIterator for Select<I, S>
where
    I: DoubleEndedIterator<Item = NodeDataRef<ElementData>>,
    S: Borrow<Selectors>,
{
    #[inline]
    fn next_back(&mut self) -> Option<NodeDataRef<ElementData>> {
        // Same as `next`, searching from the back.
        let selectors: &Selectors = self.selectors.borrow();
        self.iter
            .by_ref()
            .rev()
            .find(|element| selectors.matches(element))
    }
}
/// Convenience methods for node iterators.
pub trait NodeIterator: Sized + Iterator<Item = NodeRef> {
    /// Filter this node iterator to elements.
    #[inline]
    fn elements(self) -> Elements<Self> {
        Elements(self)
    }

    /// Filter this node iterator to text nodes.
    #[inline]
    fn text_nodes(self) -> TextNodes<Self> {
        TextNodes(self)
    }

    /// Filter this node iterator to comment nodes.
    #[inline]
    fn comments(self) -> Comments<Self> {
        Comments(self)
    }

    /// Filter this node iterator to elements matching the given selectors.
    ///
    /// Non-element nodes are dropped first (via `elements()`), then the
    /// selectors are applied to what remains.
    #[inline]
    fn select(self, selectors: &str) -> Result<Select<Elements<Self>>, ()> {
        self.elements().select(selectors)
    }
}
/// Convenience methods for element iterators.
pub trait ElementIterator: Sized + Iterator<Item = NodeDataRef<ElementData>> {
    /// Filter this element iterator to elements matching the given selectors.
    ///
    /// Returns `Err(())` when `Selectors::compile` rejects the selector string.
    #[inline]
    fn select(self, selectors: &str) -> Result<Select<Self>, ()> {
        let compiled = Selectors::compile(selectors)?;
        Ok(Select {
            iter: self,
            selectors: compiled,
        })
    }
}
// Blanket implementations: every iterator with the right item type gets the
// convenience methods without any extra code.
impl<I> NodeIterator for I where I: Iterator<Item = NodeRef> {}
impl<I> ElementIterator for I where I: Iterator<Item = NodeDataRef<ElementData>> {}

40
kuchiki/src/lib.rs Normal file
View File

@ -0,0 +1,40 @@
/*!
Kuchiki (朽木), an HTML/XML tree manipulation library for Rust.
*/
#![deny(missing_docs)]
#[macro_use]
extern crate html5ever;
#[macro_use]
extern crate matches;
mod attributes;
mod cell_extras;
pub mod iter;
mod node_data_ref;
mod parser;
mod select;
mod serializer;
#[cfg(test)]
mod tests;
mod tree;
// Public API of the crate. The modules themselves are private (except `iter`),
// so an item is only reachable by users when it is re-exported here.
pub use attributes::{Attribute, Attributes, ExpandedName};
pub use node_data_ref::NodeDataRef;
// `parse_fragment_with_options` is exported alongside `parse_fragment`, mirroring
// the `parse_html` / `parse_html_with_options` pair.
pub use parser::{
    parse_fragment, parse_fragment_with_options, parse_html, parse_html_with_options, ParseOpts,
    Sink,
};
pub use select::{Selector, Selectors, Specificity};
pub use tree::{Doctype, DocumentData, ElementData, Node, NodeData, NodeRef};
/// This module re-exports a number of traits that are useful when using Kuchiki.
/// It can be used with:
///
/// ```rust
/// use kuchiki::traits::*;
/// ```
pub mod traits {
    // Sink trait from html5ever's tendril module; the crate's own example
    // imports it through this module before calling `.one(..)` on a parser.
    pub use html5ever::tendril::TendrilSink;
    // Adaptor methods such as `.elements()` and `.select(..)` on iterators.
    pub use crate::iter::{ElementIterator, NodeIterator};
}

View File

@ -0,0 +1,116 @@
use std::cell::RefCell;
use std::fmt;
use std::ops::Deref;
use crate::tree::{Doctype, DocumentData, ElementData, Node, NodeRef};
// Conversions that consume a `NodeRef` and, when the node is of the requested
// kind, return a `NodeDataRef` to the kind-specific data. Each one delegates to
// `NodeDataRef::new_opt` with the matching `Node::as_*` accessor.
impl NodeRef {
    /// If this node is an element, return a strong reference to element-specific data.
    #[inline]
    pub fn into_element_ref(self) -> Option<NodeDataRef<ElementData>> {
        NodeDataRef::new_opt(self, Node::as_element)
    }

    /// If this node is a text node, return a strong reference to its contents.
    #[inline]
    pub fn into_text_ref(self) -> Option<NodeDataRef<RefCell<String>>> {
        NodeDataRef::new_opt(self, Node::as_text)
    }

    /// If this node is a comment, return a strong reference to its contents.
    #[inline]
    pub fn into_comment_ref(self) -> Option<NodeDataRef<RefCell<String>>> {
        NodeDataRef::new_opt(self, Node::as_comment)
    }

    /// If this node is a doctype, return a strong reference to doctype-specific data.
    #[inline]
    pub fn into_doctype_ref(self) -> Option<NodeDataRef<Doctype>> {
        NodeDataRef::new_opt(self, Node::as_doctype)
    }

    /// If this node is a document, return a strong reference to document-specific data.
    #[inline]
    pub fn into_document_ref(self) -> Option<NodeDataRef<DocumentData>> {
        NodeDataRef::new_opt(self, Node::as_document)
    }
}
/// Holds a strong reference to a node, but dereferences to some component inside of it.
#[derive(Eq)]
pub struct NodeDataRef<T> {
    // The node that `_reference` was obtained from; holding it is what keeps
    // the pointer usable.
    _keep_alive: NodeRef,
    // Raw pointer to the component, produced from a `&T` handed out by the
    // accessor passed to `new` / `new_opt`.
    _reference: *const T,
}
impl<T> NodeDataRef<T> {
    /// Create a `NodeDataRef` for a component in a given node.
    ///
    /// `f` receives the node and returns a reference to the component to expose.
    #[inline]
    pub fn new<F>(rc: NodeRef, f: F) -> NodeDataRef<T>
    where
        F: FnOnce(&Node) -> &T,
    {
        // Turn the borrow into a raw pointer first so that `rc` can be moved
        // into the struct afterwards.
        let component = f(&*rc) as *const T;
        NodeDataRef {
            _keep_alive: rc,
            _reference: component,
        }
    }

    /// Create a `NodeDataRef` for a component that may or may not be in a given node.
    ///
    /// Returns `None` (dropping `rc`) when `f` finds no such component.
    #[inline]
    pub fn new_opt<F>(rc: NodeRef, f: F) -> Option<NodeDataRef<T>>
    where
        F: FnOnce(&Node) -> Option<&T>,
    {
        let component = f(&*rc)? as *const T;
        Some(NodeDataRef {
            _keep_alive: rc,
            _reference: component,
        })
    }

    /// Access the corresponding node.
    #[inline]
    pub fn as_node(&self) -> &NodeRef {
        &self._keep_alive
    }
}
/// Dereferences to the component selected when the `NodeDataRef` was created.
impl<T> Deref for NodeDataRef<T> {
    type Target = T;
    #[inline]
    fn deref(&self) -> &T {
        // SAFETY: `_reference` was created in `new` / `new_opt` from a `&T` borrowed
        // out of the node that `_keep_alive` still holds.
        // NOTE(review): this presumes the component is not moved or freed while the
        // node is alive; confirm against the `Node` definition.
        unsafe { &*self._reference }
    }
}
// A derived `PartialEq` would compare both fields; equality is decided by the
// node alone.
impl<T> PartialEq for NodeDataRef<T> {
    #[inline]
    fn eq(&self, other: &Self) -> bool {
        let (left, right) = (&self._keep_alive, &other._keep_alive);
        left == right
    }
}
// #[derive(Clone)] would have an unnecessary `T: Clone` bound
impl<T> Clone for NodeDataRef<T> {
#[inline]
fn clone(&self) -> Self {
NodeDataRef {
_keep_alive: self._keep_alive.clone(),
_reference: self._reference,
}
}
}
/// Formats as the referenced component itself, without any wrapper.
impl<T: fmt::Debug> fmt::Debug for NodeDataRef<T> {
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        // `&**self` goes through `Deref` to reach the `T`.
        fmt::Debug::fmt(&**self, f)
    }
}
impl NodeDataRef<ElementData> {
    /// Return the concatenation of all text nodes in this subtree.
    ///
    /// Delegates to `text_contents` on the underlying node.
    pub fn text_contents(&self) -> String {
        self.as_node().text_contents()
    }
}

241
kuchiki/src/parser.rs Normal file
View File

@ -0,0 +1,241 @@
use html5ever::tendril::StrTendril;
use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
use html5ever::{self, Attribute, ExpandedName, QualName};
use std::borrow::Cow;
use crate::attributes;
use crate::tree::NodeRef;
/// Options for the HTML parser.
///
/// `ParseOpts::default()` leaves `on_parse_error` unset.
#[derive(Default)]
pub struct ParseOpts {
    /// Options for the HTML tokenizer.
    pub tokenizer: html5ever::tokenizer::TokenizerOpts,

    /// Options for the HTML tree builder.
    pub tree_builder: html5ever::tree_builder::TreeBuilderOpts,

    /// A callback for HTML parse errors (which are never fatal).
    pub on_parse_error: Option<Box<dyn FnMut(Cow<'static, str>)>>,
}
/// Parse an HTML document with html5ever and the default configuration.
///
/// Equivalent to `parse_html_with_options(ParseOpts::default())`.
pub fn parse_html() -> html5ever::Parser<Sink> {
    parse_html_with_options(ParseOpts::default())
}
/// Parse an HTML document with html5ever with custom configuration.
pub fn parse_html_with_options(opts: ParseOpts) -> html5ever::Parser<Sink> {
let sink = Sink {
document_node: NodeRef::new_document(),
on_parse_error: opts.on_parse_error,
};
let html5opts = html5ever::ParseOpts {
tokenizer: opts.tokenizer,
tree_builder: opts.tree_builder,
};
html5ever::parse_document(sink, html5opts)
}
/// Parse an HTML fragment with html5ever and the default configuration.
///
/// `ctx_name` and `ctx_attr` are passed on unchanged to
/// `parse_fragment_with_options`, with `ParseOpts::default()` as the options.
pub fn parse_fragment(ctx_name: QualName, ctx_attr: Vec<Attribute>) -> html5ever::Parser<Sink> {
    parse_fragment_with_options(ParseOpts::default(), ctx_name, ctx_attr)
}
/// Parse an HTML fragment with html5ever with custom configuration.
pub fn parse_fragment_with_options(opts: ParseOpts, ctx_name: QualName, ctx_attr: Vec<Attribute>) -> html5ever::Parser<Sink> {
let sink = Sink {
document_node: NodeRef::new_document(),
on_parse_error: opts.on_parse_error,
};
let html5opts = html5ever::ParseOpts {
tokenizer: opts.tokenizer,
tree_builder: opts.tree_builder,
};
html5ever::parse_fragment(sink, html5opts, ctx_name, ctx_attr)
}
/// Receives new tree nodes during parsing.
pub struct Sink {
    // Root of the tree being built; returned by `TreeSink::finish`.
    document_node: NodeRef,
    // Optional callback invoked with each parse error message.
    on_parse_error: Option<Box<dyn FnMut(Cow<'static, str>)>>,
}
// Implementation of the html5ever tree-builder callbacks on top of `NodeRef`.
impl TreeSink for Sink {
    type Output = NodeRef;

    // Parsing is done: hand back the document node.
    fn finish(self) -> NodeRef {
        self.document_node
    }

    type Handle = NodeRef;

    // Forward the message to the user callback, if one was configured.
    #[inline]
    fn parse_error(&mut self, message: Cow<'static, str>) {
        if let Some(ref mut handler) = self.on_parse_error {
            handler(message)
        }
    }

    #[inline]
    fn get_document(&mut self) -> NodeRef {
        self.document_node.clone()
    }

    // Store the quirks mode on the document data. The `unwrap` relies on
    // `document_node` being a document node, which is how both constructors
    // in this module create it.
    #[inline]
    fn set_quirks_mode(&mut self, mode: QuirksMode) {
        self.document_node
            .as_document()
            .unwrap()
            ._quirks_mode
            .set(mode)
    }

    #[inline]
    fn same_node(&self, x: &NodeRef, y: &NodeRef) -> bool {
        x == y
    }

    // Panics if `target` is not an element.
    #[inline]
    fn elem_name<'a>(&self, target: &'a NodeRef) -> ExpandedName<'a> {
        target.as_element().unwrap().name.expanded()
    }

    // Build an element node, converting each html5ever attribute into this
    // crate's (`ExpandedName`, `Attribute`) pair.
    #[inline]
    fn create_element(
        &mut self,
        name: QualName,
        attrs: Vec<Attribute>,
        _flags: ElementFlags,
    ) -> NodeRef {
        NodeRef::new_element(
            name,
            attrs.into_iter().map(|attr| {
                let Attribute {
                    name: QualName { prefix, ns, local },
                    value,
                } = attr;
                let value = String::from(value);
                (
                    attributes::ExpandedName { ns, local },
                    attributes::Attribute { prefix, value },
                )
            }),
        )
    }

    #[inline]
    fn create_comment(&mut self, text: StrTendril) -> NodeRef {
        NodeRef::new_comment(text)
    }

    #[inline]
    fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> NodeRef {
        NodeRef::new_processing_instruction(target, data)
    }

    // Append a node or text as the last child of `parent`. Text is merged into
    // the current last child when that child is already a text node.
    #[inline]
    fn append(&mut self, parent: &NodeRef, child: NodeOrText<NodeRef>) {
        match child {
            NodeOrText::AppendNode(node) => parent.append(node),
            NodeOrText::AppendText(text) => {
                if let Some(last_child) = parent.last_child() {
                    if let Some(existing) = last_child.as_text() {
                        existing.borrow_mut().push_str(&text);
                        return;
                    }
                }
                parent.append(NodeRef::new_text(text))
            }
        }
    }

    // Insert a node or text right before `sibling`. Text is merged into the
    // previous sibling when that sibling is already a text node.
    #[inline]
    fn append_before_sibling(&mut self, sibling: &NodeRef, child: NodeOrText<NodeRef>) {
        match child {
            NodeOrText::AppendNode(node) => sibling.insert_before(node),
            NodeOrText::AppendText(text) => {
                if let Some(previous_sibling) = sibling.previous_sibling() {
                    if let Some(existing) = previous_sibling.as_text() {
                        existing.borrow_mut().push_str(&text);
                        return;
                    }
                }
                sibling.insert_before(NodeRef::new_text(text))
            }
        }
    }

    #[inline]
    fn append_doctype_to_document(
        &mut self,
        name: StrTendril,
        public_id: StrTendril,
        system_id: StrTendril,
    ) {
        self.document_node
            .append(NodeRef::new_doctype(name, public_id, system_id))
    }

    // Add each attribute to `target` unless an attribute with the same
    // expanded name is already present. Panics if `target` is not an element.
    #[inline]
    fn add_attrs_if_missing(&mut self, target: &NodeRef, attrs: Vec<Attribute>) {
        let element = target.as_element().unwrap();
        let mut attributes = element.attributes.borrow_mut();

        for Attribute {
            name: QualName { prefix, ns, local },
            value,
        } in attrs
        {
            attributes
                .map
                .entry(attributes::ExpandedName { ns, local })
                .or_insert_with(|| {
                    let value = String::from(value);
                    attributes::Attribute { prefix, value }
                });
        }
    }

    #[inline]
    fn remove_from_parent(&mut self, target: &NodeRef) {
        target.detach()
    }

    // Move every child of `node` to the end of `new_parent`'s children.
    #[inline]
    fn reparent_children(&mut self, node: &NodeRef, new_parent: &NodeRef) {
        // FIXME: Can this be done more efficiently in rctree,
        // by moving the whole linked list of children at once?
        for child in node.children() {
            new_parent.append(child)
        }
    }

    // Intentionally a no-op.
    #[inline]
    fn mark_script_already_started(&mut self, _node: &NodeRef) {
        // FIXME: Is this useful outside of a browser?
    }

    // Panics if `target` is not an element or has no template contents.
    #[inline]
    fn get_template_contents(&mut self, target: &NodeRef) -> NodeRef {
        target
            .as_element()
            .unwrap()
            .template_contents
            .clone()
            .unwrap()
    }

    // Insert before `element` when it has a parent; otherwise append to
    // `prev_element`.
    fn append_based_on_parent_node(
        &mut self,
        element: &NodeRef,
        prev_element: &NodeRef,
        child: NodeOrText<NodeRef>,
    ) {
        if element.parent().is_some() {
            self.append_before_sibling(element, child)
        } else {
            self.append(prev_element, child)
        }
    }
}

433
kuchiki/src/select.rs Normal file
View File

@ -0,0 +1,433 @@
use crate::attributes::ExpandedName;
use cssparser::{self, CowRcStr, ParseError, SourceLocation, ToCss};
use html5ever::{LocalName, Namespace};
use crate::iter::{NodeIterator, Select};
use crate::node_data_ref::NodeDataRef;
use selectors::attr::{AttrSelectorOperation, CaseSensitivity, NamespaceConstraint};
use selectors::context::QuirksMode;
use selectors::parser::SelectorParseErrorKind;
use selectors::parser::{
NonTSPseudoClass, Parser, Selector as GenericSelector, SelectorImpl, SelectorList,
};
use selectors::{self, matching, OpaqueElement};
use std::fmt;
use crate::tree::{ElementData, Node, NodeData, NodeRef};
/// The definition of whitespace per CSS Selectors Level 3 § 4:
/// space, tab, line feed, carriage return and form feed.
///
/// Copied from rust-selectors.
static SELECTOR_WHITESPACE: &[char] = &[' ', '\t', '\n', '\r', '\x0C'];
/// Marker type that tells the `selectors` crate which concrete types this
/// crate uses for the parts of a selector.
#[derive(Debug, Clone)]
pub struct KuchikiSelectors;

impl SelectorImpl for KuchikiSelectors {
    // Attribute values are plain strings.
    type AttrValue = String;
    // Identifiers, class names, local names, part names and namespace prefixes
    // all use html5ever's `LocalName` type.
    type Identifier = LocalName;
    type ClassName = LocalName;
    type LocalName = LocalName;
    type PartName = LocalName;
    type NamespacePrefix = LocalName;
    // Namespace URLs use html5ever's `Namespace` type.
    type NamespaceUrl = Namespace;
    type BorrowedNamespaceUrl = Namespace;
    type BorrowedLocalName = LocalName;

    // The pseudo-class and pseudo-element enums defined in this module.
    type NonTSPseudoClass = PseudoClass;
    type PseudoElement = PseudoElement;

    // No extra per-match data is carried.
    type ExtraMatchingData = ();
}
struct KuchikiParser;
impl<'i> Parser<'i> for KuchikiParser {
type Impl = KuchikiSelectors;
type Error = SelectorParseErrorKind<'i>;
fn parse_non_ts_pseudo_class(
&self,
location: SourceLocation,
name: CowRcStr<'i>,
) -> Result<PseudoClass, ParseError<'i, SelectorParseErrorKind<'i>>> {
use self::PseudoClass::*;
if name.eq_ignore_ascii_case("any-link") {
Ok(AnyLink)
} else if name.eq_ignore_ascii_case("link") {
Ok(Link)
} else if name.eq_ignore_ascii_case("visited") {
Ok(Visited)
} else if name.eq_ignore_ascii_case("active") {
Ok(Active)
} else if name.eq_ignore_ascii_case("focus") {
Ok(Focus)
} else if name.eq_ignore_ascii_case("hover") {
Ok(Hover)
} else if name.eq_ignore_ascii_case("enabled") {
Ok(Enabled)
} else if name.eq_ignore_ascii_case("disabled") {
Ok(Disabled)
} else if name.eq_ignore_ascii_case("checked") {
Ok(Checked)
} else if name.eq_ignore_ascii_case("indeterminate") {
Ok(Indeterminate)
} else {
Err(
location.new_custom_error(SelectorParseErrorKind::UnsupportedPseudoClassOrElement(
name,
)),
)
}
}
}
/// The non-tree-structural pseudo-classes accepted by the selector parser.
///
/// Each variant corresponds to the CSS keyword of the same name,
/// for example `AnyLink` is written `:any-link`.
#[derive(PartialEq, Eq, Clone, Debug, Hash)]
pub enum PseudoClass {
/// `:any-link`
AnyLink,
/// `:link`
Link,
/// `:visited`
Visited,
/// `:active`
Active,
/// `:focus`
Focus,
/// `:hover`
Hover,
/// `:enabled`
Enabled,
/// `:disabled`
Disabled,
/// `:checked`
Checked,
/// `:indeterminate`
Indeterminate,
}
impl NonTSPseudoClass for PseudoClass {
    type Impl = KuchikiSelectors;

    /// `true` only for `:active` and `:hover`.
    fn is_active_or_hover(&self) -> bool {
        *self == PseudoClass::Active || *self == PseudoClass::Hover
    }

    /// `true` only for `:active`, `:hover` and `:focus`.
    fn is_user_action_state(&self) -> bool {
        *self == PseudoClass::Active
            || *self == PseudoClass::Hover
            || *self == PseudoClass::Focus
    }

    /// Every supported pseudo-class contributes to specificity.
    fn has_zero_specificity(&self) -> bool {
        false
    }
}
impl ToCss for PseudoClass {
    /// Write the pseudo-class in CSS syntax, including the leading colon.
    fn to_css<W>(&self, dest: &mut W) -> fmt::Result
    where
        W: fmt::Write,
    {
        let keyword = match self {
            PseudoClass::AnyLink => ":any-link",
            PseudoClass::Link => ":link",
            PseudoClass::Visited => ":visited",
            PseudoClass::Active => ":active",
            PseudoClass::Focus => ":focus",
            PseudoClass::Hover => ":hover",
            PseudoClass::Enabled => ":enabled",
            PseudoClass::Disabled => ":disabled",
            PseudoClass::Checked => ":checked",
            PseudoClass::Indeterminate => ":indeterminate",
        };
        dest.write_str(keyword)
    }
}
/// The pseudo-elements supported by this crate: none.
///
/// The enum has no variants, so no value of this type can ever exist.
#[derive(PartialEq, Eq, Clone, Debug, Hash)]
pub enum PseudoElement {}
impl ToCss for PseudoElement {
fn to_css<W>(&self, _dest: &mut W) -> fmt::Result
where
W: fmt::Write,
{
// An empty match on an uninhabited type: this body can never execute.
match *self {}
}
}
impl selectors::parser::PseudoElement for PseudoElement {
type Impl = KuchikiSelectors;
}
/// Lets the `selectors` crate match compiled selectors against element nodes.
///
/// The methods about shadow DOM, parts and pseudo-elements always answer
/// `false` or `None` here.
impl selectors::Element for NodeDataRef<ElementData> {
    type Impl = KuchikiSelectors;

    /// Build an opaque handle from a reference to the underlying node.
    #[inline]
    fn opaque(&self) -> OpaqueElement {
        let node: &Node = self.as_node();
        OpaqueElement::new(node)
    }

    /// Always `false`.
    #[inline]
    fn is_html_slot_element(&self) -> bool {
        false
    }

    /// Always `false`.
    #[inline]
    fn parent_node_is_shadow_root(&self) -> bool {
        false
    }

    /// Always `None`.
    #[inline]
    fn containing_shadow_host(&self) -> Option<Self> {
        None
    }

    /// The parent node, if there is one and it is an element.
    #[inline]
    fn parent_element(&self) -> Option<Self> {
        self.as_node().parent().and_then(NodeRef::into_element_ref)
    }

    /// The first element yielded by this node's `preceding_siblings()` iterator, if any.
    #[inline]
    fn prev_sibling_element(&self) -> Option<Self> {
        self.as_node().preceding_siblings().elements().next()
    }

    /// The first element yielded by this node's `following_siblings()` iterator, if any.
    #[inline]
    fn next_sibling_element(&self) -> Option<Self> {
        self.as_node().following_siblings().elements().next()
    }

    /// `true` when no child is an element or a non-empty text node.
    ///
    /// Children of any other kind (comments, for instance) do not count as content.
    #[inline]
    fn is_empty(&self) -> bool {
        self.as_node().children().all(|child| match *child.data() {
            NodeData::Element(_) => false,
            NodeData::Text(ref text) => text.borrow().is_empty(),
            _ => true,
        })
    }

    /// `true` when this element's parent is a document node.
    #[inline]
    fn is_root(&self) -> bool {
        match self.as_node().parent() {
            None => false,
            Some(parent) => matches!(*parent.data(), NodeData::Document(_)),
        }
    }

    /// `true` when the element is in the HTML namespace.
    #[inline]
    fn is_html_element_in_html_document(&self) -> bool {
        // FIXME: Have a notion of HTML document v.s. XML document?
        self.name.ns == ns!(html)
    }

    /// `true` when the element's local name equals `name`.
    #[inline]
    fn has_local_name(&self, name: &LocalName) -> bool {
        self.name.local == *name
    }

    /// `true` when the element's namespace equals `namespace`.
    #[inline]
    fn has_namespace(&self, namespace: &Namespace) -> bool {
        self.name.ns == *namespace
    }

    /// Always `false`.
    #[inline]
    fn is_part(&self, _name: &LocalName) -> bool {
        false
    }

    /// Always `None`.
    #[inline]
    fn exported_part(&self, _: &LocalName) -> Option<LocalName> {
        None
    }

    /// Always `None`.
    #[inline]
    fn imported_part(&self, _: &LocalName) -> Option<LocalName> {
        None
    }

    /// Always `false`.
    #[inline]
    fn is_pseudo_element(&self) -> bool {
        false
    }

    /// `true` when both elements have the same qualified name.
    #[inline]
    fn is_same_type(&self, other: &Self) -> bool {
        self.name == other.name
    }

    /// `true` for an HTML `a`, `area` or `link` element that carries an `href`
    /// attribute in the empty namespace (`ns!()`).
    #[inline]
    fn is_link(&self) -> bool {
        self.name.ns == ns!(html)
            && matches!(
                self.name.local,
                local_name!("a") | local_name!("area") | local_name!("link")
            )
            && self
                .attributes
                .borrow()
                .map
                .contains_key(&ExpandedName::new(ns!(), local_name!("href")))
    }

    /// `true` when the element has an `id` attribute equal to `id`
    /// under the given case sensitivity.
    #[inline]
    fn has_id(&self, id: &LocalName, case_sensitivity: CaseSensitivity) -> bool {
        self.attributes
            .borrow()
            .get(local_name!("id"))
            .map_or(false, |id_attr| {
                case_sensitivity.eq(id.as_bytes(), id_attr.as_bytes())
            })
    }

    /// `true` when `name` is non-empty and equals one of the whitespace-separated
    /// tokens of the `class` attribute under the given case sensitivity.
    #[inline]
    fn has_class(&self, name: &LocalName, case_sensitivity: CaseSensitivity) -> bool {
        let name = name.as_bytes();
        !name.is_empty()
            && if let Some(class_attr) = self.attributes.borrow().get(local_name!("class")) {
                class_attr
                    .split(SELECTOR_WHITESPACE)
                    .any(|class| case_sensitivity.eq(class.as_bytes(), name))
            } else {
                false
            }
    }

    /// Evaluate an attribute selector against this element's attributes.
    ///
    /// With `NamespaceConstraint::Any` every attribute whose local name matches is tried;
    /// with a specific namespace only the attribute with that exact expanded name is tried.
    #[inline]
    fn attr_matches(
        &self,
        ns: &NamespaceConstraint<&Namespace>,
        local_name: &LocalName,
        operation: &AttrSelectorOperation<&String>,
    ) -> bool {
        let attrs = self.attributes.borrow();
        match *ns {
            NamespaceConstraint::Any => attrs
                .map
                .iter()
                .any(|(name, attr)| name.local == *local_name && operation.eval_str(&attr.value)),
            NamespaceConstraint::Specific(ns_url) => attrs
                .map
                .get(&ExpandedName::new(ns_url, local_name.clone()))
                .map_or(false, |attr| operation.eval_str(&attr.value)),
        }
    }

    /// Never callable in practice: `PseudoElement` has no variants.
    fn match_pseudo_element(
        &self,
        pseudo: &PseudoElement,
        _context: &mut matching::MatchingContext<KuchikiSelectors>,
    ) -> bool {
        match *pseudo {}
    }

    /// Decide whether this element matches a non-tree-structural pseudo-class.
    ///
    /// Only `:any-link` and `:link` can match; every other supported pseudo-class
    /// is reported as not matching.
    fn match_non_ts_pseudo_class<F>(
        &self,
        pseudo: &PseudoClass,
        _context: &mut matching::MatchingContext<KuchikiSelectors>,
        _flags_setter: &mut F,
    ) -> bool
    where
        F: FnMut(&Self, matching::ElementSelectorFlags),
    {
        use self::PseudoClass::*;
        match *pseudo {
            Active | Focus | Hover | Enabled | Disabled | Checked | Indeterminate | Visited => {
                false
            }
            // Reuse `is_link` so there is a single definition of what counts as a link.
            AnyLink | Link => self.is_link(),
        }
    }
}
/// A pre-compiled list of CSS Selectors.
///
/// The inner `Vec` is a public field; [`Selectors::compile`] builds one from a string.
pub struct Selectors(pub Vec<Selector>);
/// A pre-compiled CSS Selector.
///
/// Wraps the `selectors` crate's generic selector specialized for `KuchikiSelectors`.
pub struct Selector(GenericSelector<KuchikiSelectors>);
/// The specificity of a selector.
///
/// Opaque, but ordered.
///
/// Determines precedence in the cascading algorithm.
/// When equal, a rule later in source order takes precedence.
#[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd)]
pub struct Specificity(u32);
impl Selectors {
    /// Compile a list of selectors from its CSS text.
    ///
    /// Returns `Err(())` when parsing fails, for example on a syntax error
    /// or an unsupported selector.
    #[inline]
    pub fn compile(s: &str) -> Result<Selectors, ()> {
        let mut input = cssparser::ParserInput::new(s);
        let mut parser = cssparser::Parser::new(&mut input);
        SelectorList::parse(&KuchikiParser, &mut parser)
            .map(|list| Selectors(list.0.into_iter().map(Selector).collect()))
            .map_err(|_| ())
    }

    /// Returns whether the given element matches at least one selector of this list.
    #[inline]
    pub fn matches(&self, element: &NodeDataRef<ElementData>) -> bool {
        for selector in &self.0 {
            if selector.matches(element) {
                return true;
            }
        }
        false
    }

    /// Wrap an element iterator so that it yields only elements matching this list.
    #[inline]
    pub fn filter<I>(&self, iter: I) -> Select<I, &Selectors>
    where
        I: Iterator<Item = NodeDataRef<ElementData>>,
    {
        let selectors = self;
        Select { iter, selectors }
    }
}
impl Selector {
    /// Returns whether the given element matches this selector.
    ///
    /// A fresh matching context in `Normal` mode with `NoQuirks` is used for each call.
    #[inline]
    pub fn matches(&self, element: &NodeDataRef<ElementData>) -> bool {
        let mode = matching::MatchingMode::Normal;
        let quirks = QuirksMode::NoQuirks;
        let mut context = matching::MatchingContext::new(mode, None, None, quirks);
        // The flag-setter callback is a no-op.
        matching::matches_selector(&self.0, 0, None, element, &mut context, &mut |_, _| {})
    }

    /// Return the specificity of this selector.
    pub fn specificity(&self) -> Specificity {
        let raw = self.0.specificity();
        Specificity(raw)
    }
}
/// Allows `"…".parse::<Selectors>()`; this simply delegates to `Selectors::compile`.
impl ::std::str::FromStr for Selectors {
type Err = ();
#[inline]
fn from_str(s: &str) -> Result<Selectors, ()> {
Selectors::compile(s)
}
}
/// Formats the selector in CSS syntax by delegating to the inner selector's `to_css`.
impl fmt::Display for Selector {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.0.to_css(f)
}
}
/// Formats the selectors in CSS syntax, separated by `", "`.
///
/// An empty list writes nothing. `Selectors` exposes its `Vec` as a public field,
/// so an empty value can be built by callers and formatting it must not panic.
impl fmt::Display for Selectors {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for (index, selector) in self.0.iter().enumerate() {
            // The separator goes between selectors, never before the first one.
            if index > 0 {
                f.write_str(", ")?;
            }
            selector.0.to_css(f)?;
        }
        Ok(())
    }
}
/// The debug representation is the same as the `Display` output.
impl fmt::Debug for Selector {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fmt::Display::fmt(self, f)
}
}
/// The debug representation is the same as the `Display` output.
impl fmt::Debug for Selectors {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fmt::Display::fmt(self, f)
}
}

105
kuchiki/src/serializer.rs Normal file
View File

@ -0,0 +1,105 @@
use html5ever::serialize::TraversalScope::*;
use html5ever::serialize::{serialize, Serialize, SerializeOpts, Serializer, TraversalScope};
use html5ever::QualName;
use std::fs::File;
use std::io::{Result, Write};
use std::path::Path;
use std::string::ToString;
use crate::tree::{NodeData, NodeRef};
/// html5ever serialization of a node and, recursively, of its children.
///
/// Note that the order of the match arms below matters: the element and
/// document/fragment arms must come before the `ChildrenOnly` catch-all.
impl Serialize for NodeRef {
fn serialize<S: Serializer>(
&self,
serializer: &mut S,
traversal_scope: TraversalScope,
) -> Result<()> {
match (traversal_scope, self.data()) {
// Elements: the start and end tags are emitted only for `IncludeNode`,
// the children are serialized in either scope.
(ref scope, &NodeData::Element(ref element)) => {
if *scope == IncludeNode {
let attrs = element.attributes.borrow();
// Unfortunately we need to allocate something to hold these &'a QualName
let attrs = attrs
.map
.iter()
.map(|(name, attr)| {
(
QualName::new(
attr.prefix.clone(),
name.ns.clone(),
name.local.clone(),
),
&attr.value,
)
})
.collect::<Vec<_>>();
serializer.start_elem(
element.name.clone(),
attrs.iter().map(|&(ref name, value)| (name, &**value)),
)?
}
// Children are always serialized including themselves.
for child in self.children() {
Serialize::serialize(&child, serializer, IncludeNode)?
}
if *scope == IncludeNode {
serializer.end_elem(element.name.clone())?
}
Ok(())
}
// Documents and fragments produce no output of their own, only their children.
(_, &NodeData::DocumentFragment) | (_, &NodeData::Document(_)) => {
for child in self.children() {
Serialize::serialize(&child, serializer, IncludeNode)?
}
Ok(())
}
// Any remaining node kind has no children to emit in `ChildrenOnly` scope.
(ChildrenOnly(_), _) => Ok(()),
// Only the doctype name is written; `public_id` and `system_id` are not used here.
(IncludeNode, &NodeData::Doctype(ref doctype)) => {
serializer.write_doctype(&doctype.name)
}
(IncludeNode, &NodeData::Text(ref text)) => serializer.write_text(&text.borrow()),
(IncludeNode, &NodeData::Comment(ref text)) => serializer.write_comment(&text.borrow()),
(IncludeNode, &NodeData::ProcessingInstruction(ref contents)) => {
let contents = contents.borrow();
serializer.write_processing_instruction(&contents.0, &contents.1)
}
}
}
}
/// Formats the node and its descendants as serialized HTML.
///
/// `Display` is implemented instead of `ToString` so that `to_string()` remains
/// available through the standard blanket impl and `{}` formatting also works.
/// A serialization failure or non-UTF-8 output is reported as `std::fmt::Error`.
impl std::fmt::Display for NodeRef {
    #[inline]
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut u8_vec = Vec::new();
        // Resolves to the inherent `NodeRef::serialize`, which writes HTML to `u8_vec`.
        self.serialize(&mut u8_vec).map_err(|_| std::fmt::Error)?;
        let html = String::from_utf8(u8_vec).map_err(|_| std::fmt::Error)?;
        f.write_str(&html)
    }
}
impl NodeRef {
    /// Serialize this node and its descendants in HTML syntax to the given stream.
    ///
    /// # Errors
    ///
    /// Returns any I/O error reported while writing to `writer`.
    #[inline]
    pub fn serialize<W: Write>(&self, writer: &mut W) -> Result<()> {
        serialize(
            writer,
            self,
            SerializeOpts {
                traversal_scope: IncludeNode,
                ..Default::default()
            },
        )
    }

    /// Serialize this node and its descendants in HTML syntax to a new file at the given path.
    ///
    /// # Errors
    ///
    /// Returns any I/O error from creating the file, writing to it or flushing it.
    #[inline]
    pub fn serialize_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
        // Buffer the output: the serializer issues many small writes.
        let mut file = std::io::BufWriter::new(File::create(path)?);
        self.serialize(&mut file)?;
        // Flush explicitly; an error during the implicit flush on drop would be lost.
        file.flush()
    }
}

185
kuchiki/src/tests.rs Normal file
View File

@ -0,0 +1,185 @@
use html5ever::tree_builder::QuirksMode;
use html5ever::QualName;
use std::path::Path;
use tempfile::TempDir;
use crate::parser::{parse_html, parse_fragment};
use crate::select::*;
use crate::traits::*;
/// Checks `text_contents()`, the `text_nodes()` iterator over a paragraph's descendants,
/// and in-place mutation of a text node through `borrow_mut()`.
#[test]
fn text_nodes() {
let html = r"
<!doctype html>
<title>Test case</title>
<p>Content contains <b>Important</b> data</p>";
let document = parse_html().one(html);
let paragraph = document.select("p").unwrap().collect::<Vec<_>>();
assert_eq!(paragraph.len(), 1);
assert_eq!(
paragraph[0].text_contents(),
"Content contains Important data"
);
let texts = paragraph[0]
.as_node()
.descendants()
.text_nodes()
.collect::<Vec<_>>();
assert_eq!(texts.len(), 3);
assert_eq!(&*texts[0].borrow(), "Content contains ");
assert_eq!(&*texts[1].borrow(), "Important");
assert_eq!(&*texts[2].borrow(), " data");
// Scope the mutable borrow so it is released before the node is read again.
{
let mut x = texts[0].borrow_mut();
x.truncate(0);
x.push_str("Content doesn't contain ");
}
assert_eq!(&*texts[0].borrow(), "Content doesn't contain ");
}
/// Parses a small document with unclosed tags and checks both the detected quirks mode
/// and the serialized output.
#[test]
fn parse_and_serialize() {
let html = r"
<!doctype html>
<title>Test case</title>
<p>Content";
let document = parse_html().one(html);
assert_eq!(
document.as_document().unwrap().quirks_mode(),
QuirksMode::NoQuirks
);
assert_eq!(
document.to_string(),
r"<!DOCTYPE html><html><head><title>Test case</title>
</head><body><p>Content</p></body></html>"
);
}
/// Parses a fragment with a `tbody` context element and checks the serialized result.
#[test]
fn parse_and_serialize_fragment() {
let html = r"<tbody><tr><td>Test case";
let ctx_name = QualName::new(None, ns!(html), local_name!("tbody"));
let document = parse_fragment(ctx_name, vec![]).one(html);
assert_eq!(document.as_document().unwrap().quirks_mode(), QuirksMode::NoQuirks);
assert_eq!(document.to_string(), r"<html><tr><td>Test case</td></tr></html>");
}
/// Parses `test_data/foo.html` from the crate directory and checks its serialization.
#[test]
fn parse_file() {
    // `Path::join` accepts string slices directly, so no owned `String` is allocated.
    let path = Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("test_data")
        .join("foo.html");
    let html = r"<!DOCTYPE html><html><head>
<title>Test case</title>
</head>
<body>
<p>Foo</p>
</body></html>";
    let document = parse_html().from_utf8().from_file(&path).unwrap();
    assert_eq!(document.to_string(), html);
}
/// Round-trips a document through `serialize_to_file` and `from_file`
/// and checks that both documents serialize identically.
#[test]
fn serialize_and_read_file() {
    let tempdir = TempDir::new().unwrap();
    let path = tempdir.path().join("temp.html");
    let html = r"<!DOCTYPE html><html><head><title>Title</title></head><body>Body</body></html>";
    let document = parse_html().one(html);
    // Fail here, with a clear message, rather than later when the file is read back.
    document
        .serialize_to_file(&path)
        .expect("serializing to a fresh temporary file should succeed");
    let document2 = parse_html().from_utf8().from_file(&path).unwrap();
    assert_eq!(document.to_string(), document2.to_string());
}
/// Checks that `select("p.foo")` finds both matching paragraphs, that attributes can be
/// looked up by string and by `local_name!`, and that `Selectors::filter` gives the same result.
#[test]
fn select() {
let html = r"
<title>Test case</title>
<p class=foo>Foo
<p>Bar
<p class=foo>Foo
";
let document = parse_html().one(html);
let matching = document.select("p.foo").unwrap().collect::<Vec<_>>();
assert_eq!(matching.len(), 2);
let child = matching[0].as_node().first_child().unwrap();
assert_eq!(&**child.as_text().unwrap().borrow(), "Foo\n");
assert_eq!(matching[0].attributes.borrow().get("class"), Some("foo"));
assert_eq!(
matching[0].attributes.borrow().get(local_name!("class")),
Some("foo")
);
// The pre-compiled selector list must select the same elements.
let selectors = Selectors::compile("p.foo").unwrap();
let matching2 = selectors
.filter(document.descendants().elements())
.collect::<Vec<_>>();
assert_eq!(matching, matching2);
}
/// Checks that `select_first` returns the first matching element
/// and an error when nothing matches.
#[test]
fn select_first() {
let html = r"
<title>Test case</title>
<p class=foo>Foo
<p>Bar
<p class=foo>Baz
";
let document = parse_html().one(html);
let matching = document.select_first("p.foo").unwrap();
let child = matching.as_node().first_child().unwrap();
assert_eq!(&**child.as_text().unwrap().borrow(), "Foo\n");
assert_eq!(matching.attributes.borrow().get("class"), Some("foo"));
assert_eq!(
matching.attributes.borrow().get(local_name!("class")),
Some("foo")
);
// No element carries the class `bar`.
assert!(document.select_first("p.bar").is_err());
}
/// Checks `to_string()` on a single node picked out of the tree rather than the whole document.
#[test]
fn to_string() {
let html = r"<!DOCTYPE html>
<html>
<head>
<title>Test case</title>
</head>
<body>
<p class=foo>Foo
</body>
</html>";
let document = parse_html().one(html);
// The node at index 11 of the inclusive descendants is expected to be the `<p>` element;
// this index depends on the exact whitespace in the input above.
assert_eq!(
document
.inclusive_descendants()
.nth(11)
.unwrap()
.to_string(),
"<p class=\"foo\">Foo\n \n</p>"
);
}
/// Checks the relative ordering of specificities: a class selector and a pseudo-class
/// selector are equal, and both rank above a type selector.
#[test]
fn specificity() {
let selectors = Selectors::compile(".example, :first-child, div").unwrap();
let specificities = selectors
.0
.iter()
.map(|s| s.specificity())
.collect::<Vec<_>>();
assert_eq!(specificities.len(), 3);
// `Specificity` does not derive `Debug`, hence `assert!` rather than `assert_eq!`.
assert!(specificities[0] == specificities[1]);
assert!(specificities[0] > specificities[2]);
assert!(specificities[1] > specificities[2]);
}

489
kuchiki/src/tree.rs Normal file
View File

@ -0,0 +1,489 @@
use html5ever::tree_builder::QuirksMode;
use html5ever::QualName;
use std::cell::{Cell, RefCell};
use std::fmt;
use std::ops::Deref;
use std::rc::{Rc, Weak};
use crate::attributes::{Attribute, Attributes, ExpandedName};
use crate::cell_extras::*;
use crate::iter::NodeIterator;
/// Node data specific to the node type.
///
/// Text, comment and processing-instruction contents are wrapped in `RefCell`
/// so that they can be modified through a shared reference to the node.
#[derive(Debug, PartialEq, Clone)]
pub enum NodeData {
/// Element node
Element(ElementData),
/// Text node
Text(RefCell<String>),
/// Comment node
Comment(RefCell<String>),
/// Processing instruction node, stored as a `(target, data)` pair
ProcessingInstruction(RefCell<(String, String)>),
/// Doctype node
Doctype(Doctype),
/// Document node
Document(DocumentData),
/// Document fragment node
DocumentFragment,
}
/// Data specific to doctype nodes.
///
/// All three fields are plain owned strings.
#[derive(Debug, PartialEq, Clone)]
pub struct Doctype {
/// The name of the doctype
pub name: String,
/// The public ID of the doctype
pub public_id: String,
/// The system ID of the doctype
pub system_id: String,
}
/// Data specific to element nodes.
#[derive(Debug, PartialEq, Clone)]
pub struct ElementData {
/// The namespace and local name of the element, such as `ns!(html)` and `body`.
pub name: QualName,
/// The attributes of the elements.
///
/// Wrapped in a `RefCell` so attributes can be modified through a shared reference.
pub attributes: RefCell<Attributes>,
/// If the element is an HTML `<template>` element,
/// the document fragment node that is the root of template contents.
///
/// `NodeRef::new_element` sets this to `Some` only for the HTML `template` name.
pub template_contents: Option<NodeRef>,
}
/// Data specific to document nodes.
#[derive(Debug, PartialEq, Clone)]
pub struct DocumentData {
// Kept in a `Cell` so it can be updated through a shared reference;
// read it through `quirks_mode()`.
#[doc(hidden)]
pub _quirks_mode: Cell<QuirksMode>,
}
impl DocumentData {
/// The quirks mode of the document, as determined by the HTML parser.
#[inline]
pub fn quirks_mode(&self) -> QuirksMode {
self._quirks_mode.get()
}
}
/// A strong reference to a node.
///
/// A node is destroyed when the last strong reference to it is dropped.
///
/// Each node holds a strong reference to its first child and next sibling (if any),
/// but only a weak reference to its last child, previous sibling, and parent.
/// This is to avoid strong reference cycles, which would cause memory leaks.
///
/// As a result, a single `NodeRef` is sufficient to keep alive a node
/// and nodes that are after it in tree order
/// (its descendants, its following siblings, and their descendants)
/// but not other nodes in a tree.
///
/// To avoid destroying nodes prematurely,
/// programs typically hold a strong reference to the root of a document
/// until they're done with that document.
#[derive(Clone, Debug)]
pub struct NodeRef(pub Rc<Node>);
impl Deref for NodeRef {
type Target = Node;
#[inline]
fn deref(&self) -> &Node {
&*self.0
}
}
impl Eq for NodeRef {}
impl PartialEq for NodeRef {
#[inline]
fn eq(&self, other: &NodeRef) -> bool {
let a: *const Node = &*self.0;
let b: *const Node = &*other.0;
a == b
}
}
/// A node inside a DOM-like tree.
///
/// Every link is kept in a `Cell` so the tree can be rewired through shared references.
pub struct Node {
// Weak links (upwards and backwards).
parent: Cell<Option<Weak<Node>>>,
previous_sibling: Cell<Option<Weak<Node>>>,
// Strong links (forwards and downwards).
next_sibling: Cell<Option<Rc<Node>>>,
first_child: Cell<Option<Rc<Node>>>,
// Weak link to the last child.
last_child: Cell<Option<Weak<Node>>>,
// The node-type-specific payload.
data: NodeData,
}
/// Prints the node data followed by the node's address; linked nodes are not printed.
impl fmt::Debug for Node {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "{:?} @ {:?}", self.data, self as *const Node)
}
}
/// Prevent implicit recursion when dropping nodes to avoid overflowing the stack.
///
/// The implicit drop is correct, but recursive.
/// In the worst case (where no node has both a next sibling and a child),
/// a tree of a few tens of thousands of nodes could cause a stack overflow.
///
/// This `Drop` implementation makes sure the recursion does not happen.
/// Instead, it has an explicit `Vec<Rc<Node>>` stack to traverse the subtree,
/// but only following `Rc<Node>` references that are "unique":
/// that have a strong reference count of 1.
/// Those are the nodes that would have been dropped recursively.
///
/// The stack holds ancestors of the current node rather than preceding siblings,
/// on the assumption that large document trees are typically wider than deep.
impl Drop for Node {
fn drop(&mut self) {
// `.take_if_unique_strong()` temporarily leaves the tree in an inconsistent state,
// as the corresponding `Weak` reference in the other direction is not removed.
// It is important that all `Some(_)` strong references it returns
// are dropped by the end of this `drop` call,
// and that no user code is invoked in-between.
// Sharing `stack` between these two calls is not necessary,
// but it allows re-using memory allocations.
let mut stack = Vec::new();
if let Some(rc) = self.first_child.take_if_unique_strong() {
non_recursive_drop_unique_rc(rc, &mut stack);
}
if let Some(rc) = self.next_sibling.take_if_unique_strong() {
non_recursive_drop_unique_rc(rc, &mut stack);
}
// Iteratively drops `rc` and every node reachable from it through unique strong links.
fn non_recursive_drop_unique_rc(mut rc: Rc<Node>, stack: &mut Vec<Rc<Node>>) {
loop {
// Descend first, remembering the current node on the explicit stack.
if let Some(child) = rc.first_child.take_if_unique_strong() {
stack.push(rc);
rc = child;
continue;
}
if let Some(sibling) = rc.next_sibling.take_if_unique_strong() {
// The previous value of `rc: Rc<Node>` is dropped here.
// Since it was unique, the corresponding `Node` is dropped as well.
// `<Node as Drop>::drop` does not call `non_recursive_drop_unique_rc`
// as both the first child and next sibling were already taken.
// Weak reference counts decremented here for `Cell`s that are `Some`:
// * `rc.parent`: still has a strong reference in `stack` or elsewhere
// * `rc.last_child`: this is the last weak ref. Deallocated now.
// * `rc.previous_sibling`: this is the last weak ref. Deallocated now.
rc = sibling;
continue;
}
// Nothing left below or after this node: go back up to the saved ancestor.
if let Some(parent) = stack.pop() {
// Same as in the above comment.
rc = parent;
continue;
}
return;
}
}
}
}
impl NodeRef {
    /// Create a new node that is not linked to any parent, sibling or child.
    #[inline]
    pub fn new(data: NodeData) -> NodeRef {
        let node = Node {
            data,
            parent: Cell::new(None),
            previous_sibling: Cell::new(None),
            next_sibling: Cell::new(None),
            first_child: Cell::new(None),
            last_child: Cell::new(None),
        };
        NodeRef(Rc::new(node))
    }

    /// Create a new element node.
    ///
    /// An HTML `template` element additionally gets a fresh document fragment
    /// as its template contents.
    #[inline]
    pub fn new_element<I>(name: QualName, attributes: I) -> NodeRef
    where
        I: IntoIterator<Item = (ExpandedName, Attribute)>,
    {
        let is_template = name.expanded() == expanded_name!(html "template");
        let template_contents = if is_template {
            Some(NodeRef::new(NodeData::DocumentFragment))
        } else {
            None
        };
        let attributes = RefCell::new(Attributes {
            map: attributes.into_iter().collect(),
        });
        NodeRef::new(NodeData::Element(ElementData {
            name,
            attributes,
            template_contents,
        }))
    }

    /// Create a new text node.
    #[inline]
    pub fn new_text<T: Into<String>>(value: T) -> NodeRef {
        let contents: String = value.into();
        NodeRef::new(NodeData::Text(RefCell::new(contents)))
    }

    /// Create a new comment node.
    #[inline]
    pub fn new_comment<T: Into<String>>(value: T) -> NodeRef {
        let contents: String = value.into();
        NodeRef::new(NodeData::Comment(RefCell::new(contents)))
    }

    /// Create a new processing instruction node from its target and data.
    #[inline]
    pub fn new_processing_instruction<T1, T2>(target: T1, data: T2) -> NodeRef
    where
        T1: Into<String>,
        T2: Into<String>,
    {
        let pair: (String, String) = (target.into(), data.into());
        NodeRef::new(NodeData::ProcessingInstruction(RefCell::new(pair)))
    }

    /// Create a new doctype node.
    #[inline]
    pub fn new_doctype<T1, T2, T3>(name: T1, public_id: T2, system_id: T3) -> NodeRef
    where
        T1: Into<String>,
        T2: Into<String>,
        T3: Into<String>,
    {
        let doctype = Doctype {
            name: name.into(),
            public_id: public_id.into(),
            system_id: system_id.into(),
        };
        NodeRef::new(NodeData::Doctype(doctype))
    }

    /// Create a new document node, initially in no-quirks mode.
    #[inline]
    pub fn new_document() -> NodeRef {
        let document = DocumentData {
            _quirks_mode: Cell::new(QuirksMode::NoQuirks),
        };
        NodeRef::new(NodeData::Document(document))
    }

    /// Return the concatenation of all text nodes in this subtree.
    pub fn text_contents(&self) -> String {
        self.inclusive_descendants()
            .text_nodes()
            .fold(String::new(), |mut joined, text_node| {
                joined.push_str(&text_node.borrow());
                joined
            })
    }
}
impl Node {
/// Return a reference to this node's node-type-specific data.
#[inline]
pub fn data(&self) -> &NodeData {
&self.data
}
/// If this node is an element, return a reference to element-specific data.
#[inline]
pub fn as_element(&self) -> Option<&ElementData> {
match self.data {
NodeData::Element(ref value) => Some(value),
_ => None,
}
}
/// If this node is a text node, return a reference to its contents.
#[inline]
pub fn as_text(&self) -> Option<&RefCell<String>> {
match self.data {
NodeData::Text(ref value) => Some(value),
_ => None,
}
}
/// If this node is a comment, return a reference to its contents.
#[inline]
pub fn as_comment(&self) -> Option<&RefCell<String>> {
match self.data {
NodeData::Comment(ref value) => Some(value),
_ => None,
}
}
/// If this node is a doctype, return a reference to doctype-specific data.
#[inline]
pub fn as_doctype(&self) -> Option<&Doctype> {
match self.data {
NodeData::Doctype(ref value) => Some(value),
_ => None,
}
}
/// If this node is a document, return a reference to document-specific data.
#[inline]
pub fn as_document(&self) -> Option<&DocumentData> {
match self.data {
NodeData::Document(ref value) => Some(value),
_ => None,
}
}
/// Return a reference to the parent node, unless this node is the root of the tree.
#[inline]
pub fn parent(&self) -> Option<NodeRef> {
self.parent.upgrade().map(NodeRef)
}
/// Return a reference to the first child of this node, unless it has no child.
#[inline]
pub fn first_child(&self) -> Option<NodeRef> {
self.first_child.clone_inner().map(NodeRef)
}
/// Return a reference to the last child of this node, unless it has no child.
#[inline]
pub fn last_child(&self) -> Option<NodeRef> {
self.last_child.upgrade().map(NodeRef)
}
/// Return a reference to the previous sibling of this node, unless it is a first child.
#[inline]
pub fn previous_sibling(&self) -> Option<NodeRef> {
self.previous_sibling.upgrade().map(NodeRef)
}
/// Return a reference to the next sibling of this node, unless it is a last child.
#[inline]
pub fn next_sibling(&self) -> Option<NodeRef> {
self.next_sibling.clone_inner().map(NodeRef)
}
/// Detach a node from its parent and siblings. Children are not affected.
///
/// To remove a node and its descendants, detach it and drop any strong reference to it.
pub fn detach(&self) {
// Take this node's own outgoing links first, leaving them all `None`.
let parent_weak = self.parent.take();
let previous_sibling_weak = self.previous_sibling.take();
let next_sibling_strong = self.next_sibling.take();
// Upgrade before `previous_sibling_weak` is moved into a neighbour below.
let previous_sibling_opt = previous_sibling_weak
.as_ref()
.and_then(|weak| weak.upgrade());
// Fix the backward link: either the next sibling's `previous_sibling`,
// or, when this node was the last child, the parent's `last_child`.
if let Some(next_sibling_ref) = next_sibling_strong.as_ref() {
next_sibling_ref
.previous_sibling
.replace(previous_sibling_weak);
} else if let Some(parent_ref) = parent_weak.as_ref() {
if let Some(parent_strong) = parent_ref.upgrade() {
parent_strong.last_child.replace(previous_sibling_weak);
}
}
// Fix the forward link: either the previous sibling's `next_sibling`,
// or, when this node was the first child, the parent's `first_child`.
if let Some(previous_sibling_strong) = previous_sibling_opt {
previous_sibling_strong
.next_sibling
.replace(next_sibling_strong);
} else if let Some(parent_ref) = parent_weak.as_ref() {
if let Some(parent_strong) = parent_ref.upgrade() {
parent_strong.first_child.replace(next_sibling_strong);
}
}
}
}
impl NodeRef {
/// Append a new child to this node, after existing children.
///
/// The new child is detached from its previous position.
pub fn append(&self, new_child: NodeRef) {
new_child.detach();
new_child.parent.replace(Some(Rc::downgrade(&self.0)));
// Point `last_child` at the new node and look at what it pointed to before.
if let Some(last_child_weak) = self.last_child.replace(Some(Rc::downgrade(&new_child.0))) {
if let Some(last_child) = last_child_weak.upgrade() {
// There was a last child: chain the new node after it.
new_child.previous_sibling.replace(Some(last_child_weak));
debug_assert!(last_child.next_sibling.is_none());
last_child.next_sibling.replace(Some(new_child.0));
return;
}
}
// No previous last child: the new node is also the first child.
debug_assert!(self.first_child.is_none());
self.first_child.replace(Some(new_child.0));
}
/// Prepend a new child to this node, before existing children.
///
/// The new child is detached from its previous position.
pub fn prepend(&self, new_child: NodeRef) {
new_child.detach();
new_child.parent.replace(Some(Rc::downgrade(&self.0)));
if let Some(first_child) = self.first_child.take() {
// There was a first child: chain it after the new node.
debug_assert!(first_child.previous_sibling.is_none());
first_child
.previous_sibling
.replace(Some(Rc::downgrade(&new_child.0)));
new_child.next_sibling.replace(Some(first_child));
} else {
// No children yet: the new node is also the last child.
debug_assert!(self.first_child.is_none());
self.last_child.replace(Some(Rc::downgrade(&new_child.0)));
}
self.first_child.replace(Some(new_child.0));
}
/// Insert a new sibling after this node.
///
/// The new sibling is detached from its previous position.
pub fn insert_after(&self, new_sibling: NodeRef) {
new_sibling.detach();
// The new sibling shares this node's parent (possibly none).
new_sibling.parent.replace(self.parent.clone_inner());
new_sibling
.previous_sibling
.replace(Some(Rc::downgrade(&self.0)));
if let Some(next_sibling) = self.next_sibling.take() {
// Splice the new sibling between this node and its former next sibling.
debug_assert!(next_sibling.previous_sibling().unwrap() == *self);
next_sibling
.previous_sibling
.replace(Some(Rc::downgrade(&new_sibling.0)));
new_sibling.next_sibling.replace(Some(next_sibling));
} else if let Some(parent) = self.parent() {
// This node was the last child, so the new sibling becomes the last child.
debug_assert!(parent.last_child().unwrap() == *self);
parent
.last_child
.replace(Some(Rc::downgrade(&new_sibling.0)));
}
self.next_sibling.replace(Some(new_sibling.0));
}
/// Insert a new sibling before this node.
///
/// The new sibling is detached from its previous position.
pub fn insert_before(&self, new_sibling: NodeRef) {
new_sibling.detach();
// The new sibling shares this node's parent (possibly none).
new_sibling.parent.replace(self.parent.clone_inner());
new_sibling.next_sibling.replace(Some(self.0.clone()));
// Point `previous_sibling` at the new node and look at what it pointed to before.
if let Some(previous_sibling_weak) = self
.previous_sibling
.replace(Some(Rc::downgrade(&new_sibling.0)))
{
if let Some(previous_sibling) = previous_sibling_weak.upgrade() {
// Splice the new sibling between the former previous sibling and this node.
new_sibling
.previous_sibling
.replace(Some(previous_sibling_weak));
debug_assert!(previous_sibling.next_sibling().unwrap() == *self);
previous_sibling.next_sibling.replace(Some(new_sibling.0));
return;
}
}
// This node was the first child, so the new sibling becomes the first child.
if let Some(parent) = self.parent() {
debug_assert!(parent.first_child().unwrap() == *self);
parent.first_child.replace(Some(new_sibling.0));
}
}
}

View File

@ -0,0 +1,9 @@
<!DOCTYPE html>
<html>
<head>
<title>Test case</title>
</head>
<body>
<p>Foo</p>
</body>
</html>