diff --git a/Cargo.toml b/Cargo.toml index 9fae8e5..a157627 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,4 +3,5 @@ members = [ "bin", "sanitize-html-rs", + "kuchiki", ] diff --git a/kuchiki/.gitignore b/kuchiki/.gitignore new file mode 100644 index 0000000..884cb47 --- /dev/null +++ b/kuchiki/.gitignore @@ -0,0 +1,3 @@ +target +Cargo.lock +.cargo/config diff --git a/kuchiki/.travis.yml b/kuchiki/.travis.yml new file mode 100644 index 0000000..017d7c3 --- /dev/null +++ b/kuchiki/.travis.yml @@ -0,0 +1,6 @@ +sudo: false +language: rust +rust: + - nightly + - beta + - stable diff --git a/kuchiki/Cargo.toml b/kuchiki/Cargo.toml new file mode 100644 index 0000000..4e7229b --- /dev/null +++ b/kuchiki/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "kuchiki" +version = "0.8.1" +authors = ["Simon Sapin "] +license = "MIT" +description = "(朽木) HTML/XML tree manipulation library" +repository = "https://github.com/kuchiki-rs/kuchiki" +edition = "2018" + +[lib] +name = "kuchiki" +doctest = false + +[dependencies] +cssparser = "0.27" +matches = "0.1.4" +html5ever = "0.25" +selectors = "0.22" +indexmap = "1.6.0" + +[dev-dependencies] +tempfile = "3" diff --git a/kuchiki/LICENSE b/kuchiki/LICENSE new file mode 100644 index 0000000..31aa793 --- /dev/null +++ b/kuchiki/LICENSE @@ -0,0 +1,23 @@ +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/kuchiki/README.md b/kuchiki/README.md new file mode 100644 index 0000000..30d4f54 --- /dev/null +++ b/kuchiki/README.md @@ -0,0 +1,10 @@ +Kuchiki (朽木) +============== + +HTML/XML¹ tree manipulation library for Rust. + +[Documentation](https://docs.rs/kuchiki/) + +See [users.rust-lang.org discussion](http://users.rust-lang.org/t/kuchiki-a-vaporware-html-xml-tree-manipulation-library/435). + +¹ There is no support for XML syntax yet. The plan is to integrate with an existing parser. diff --git a/kuchiki/docs/.nojekyll b/kuchiki/docs/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/kuchiki/docs/404.html b/kuchiki/docs/404.html new file mode 100644 index 0000000..9fef978 --- /dev/null +++ b/kuchiki/docs/404.html @@ -0,0 +1,3 @@ + + +Moved to docs.rs diff --git a/kuchiki/docs/index.html b/kuchiki/docs/index.html new file mode 100644 index 0000000..9fef978 --- /dev/null +++ b/kuchiki/docs/index.html @@ -0,0 +1,3 @@ + + +Moved to docs.rs diff --git a/kuchiki/examples/find_matches.rs b/kuchiki/examples/find_matches.rs new file mode 100644 index 0000000..848e08e --- /dev/null +++ b/kuchiki/examples/find_matches.rs @@ -0,0 +1,48 @@ +extern crate kuchiki; + +use kuchiki::traits::*; + +fn main() { + let html = r" + + + + +

Example

+

Hello, world!

+

I love HTML

+ + + "; + let css_selector = ".foo"; + + let document = kuchiki::parse_html().one(html); + + for css_match in document.select(css_selector).unwrap() { + // css_match is a NodeDataRef, but most of the interesting methods are + // on NodeRef. Let's get the underlying NodeRef. + let as_node = css_match.as_node(); + + // In this example, as_node represents an HTML node like + // + //

Hello world!

" + // + // Which is distinct from just 'Hello world!'. To get rid of that

+ // tag, we're going to get each element's first child, which will be + // a "text" node. + // + // There are other kinds of nodes, of course. The possibilities are all + // listed in the `NodeData` enum in this crate. + let text_node = as_node.first_child().unwrap(); + + // Let's get the actual text in this text node. A text node wraps around + // a RefCell, so we need to call borrow() to get a &str out. + let text = text_node.as_text().unwrap().borrow(); + + // Prints: + // + // "Hello, world!" + // "I love HTML" + println!("{:?}", text); + } +} diff --git a/kuchiki/examples/stack-overflow.rs b/kuchiki/examples/stack-overflow.rs new file mode 100644 index 0000000..535b702 --- /dev/null +++ b/kuchiki/examples/stack-overflow.rs @@ -0,0 +1,22 @@ +extern crate kuchiki; + +fn main() { + let mut depth = 2; + // 20 M nodes is a few GB of memory. + while depth <= 20_000_000 { + let mut node = kuchiki::NodeRef::new_text(""); + for _ in 0..depth { + let parent = kuchiki::NodeRef::new_text(""); + parent.append(node); + node = parent; + } + + println!("Trying to drop {} nodes...", depth); + // Without an explicit `impl Drop for Node`, + // depth = 20_000 causes "thread '

' has overflowed its stack" + // on my machine (Linux x86_64). + ::std::mem::drop(node); + + depth *= 10; + } +} diff --git a/kuchiki/src/attributes.rs b/kuchiki/src/attributes.rs new file mode 100644 index 0000000..655585e --- /dev/null +++ b/kuchiki/src/attributes.rs @@ -0,0 +1,83 @@ +use html5ever::{LocalName, Namespace, Prefix}; +use indexmap::{map::Entry, IndexMap}; + +/// Convenience wrapper around a indexmap that adds method for attributes in the null namespace. +#[derive(Debug, PartialEq, Clone)] +pub struct Attributes { + /// A map of attributes whose name can have namespaces. + pub map: IndexMap, +} + +/// +#[derive(Debug, PartialEq, Eq, Hash, Clone, PartialOrd, Ord)] +pub struct ExpandedName { + /// Namespace URL + pub ns: Namespace, + /// "Local" part of the name + pub local: LocalName, +} + +impl ExpandedName { + /// Trivial constructor + pub fn new, L: Into>(ns: N, local: L) -> Self { + ExpandedName { + ns: ns.into(), + local: local.into(), + } + } +} + +/// The non-identifying parts of an attribute +#[derive(Debug, PartialEq, Clone)] +pub struct Attribute { + /// The namespace prefix, if any + pub prefix: Option, + /// The attribute value + pub value: String, +} + +impl Attributes { + /// Like IndexMap::contains + pub fn contains>(&self, local_name: A) -> bool { + self.map.contains_key(&ExpandedName::new(ns!(), local_name)) + } + + /// Like IndexMap::get + pub fn get>(&self, local_name: A) -> Option<&str> { + self.map + .get(&ExpandedName::new(ns!(), local_name)) + .map(|attr| &*attr.value) + } + + /// Like IndexMap::get_mut + pub fn get_mut>(&mut self, local_name: A) -> Option<&mut String> { + self.map + .get_mut(&ExpandedName::new(ns!(), local_name)) + .map(|attr| &mut attr.value) + } + + /// Like IndexMap::entry + pub fn entry>(&mut self, local_name: A) -> Entry { + self.map.entry(ExpandedName::new(ns!(), local_name)) + } + + /// Like IndexMap::insert + pub fn insert>( + &mut self, + local_name: A, + value: String, + ) -> Option { + 
self.map.insert( + ExpandedName::new(ns!(), local_name), + Attribute { + prefix: None, + value, + }, + ) + } + + /// Like IndexMap::remove + pub fn remove>(&mut self, local_name: A) -> Option { + self.map.remove(&ExpandedName::new(ns!(), local_name)) + } +} diff --git a/kuchiki/src/cell_extras.rs b/kuchiki/src/cell_extras.rs new file mode 100644 index 0000000..4c7538f --- /dev/null +++ b/kuchiki/src/cell_extras.rs @@ -0,0 +1,113 @@ +//! Specialized methods for `Cell` of some specific `!Copy` types, +//! allowing limited access to a value without moving it of the cell. +//! +//! +//! # Soundness +//! +//! These methods use and `Cell::as_ptr` and `unsafe`. +//! Their soundness lies in that: +//! +//! * `Cell: !Sync` for any `T`, so no other thread is accessing this cell. +//! * For the duration of the raw pointer access, +//! this thread only runs code that is known to not access the same cell again. +//! In particular, no method of a type paramater is called. +//! For example, `clone_inner` would be unsound to generalize to any `Cell` +//! because it would involve running arbitrary code through `T::clone` +//! and provide that code with a reference to the inside of the cell. +//! +//! ```rust +//! struct Evil(Box, Rc>>); +//! impl Clone for Evil { +//! fn clone(&self) -> Self { +//! mem::drop(self.1.take()); // Mess with the "other" node, which might be `self`. +//! Evil( +//! self.0.clone(), // possible use after free! +//! Rc::new(Cell::new(None)) +//! ) +//! } +//! } +//! let a = Rc::new(Cell::new(None)); +//! a.set(Some(Evil(Box::new(5), a.clone()))); // Make a reference cycle. +//! a.clone_inner(); +//! ``` +//! +//! `Rc::clone` and `Weak::clone` do not have this problem +//! as they only increment reference counts and never call `T::clone`. +//! +//! +//! # Alternative +//! +//! To avoid using `unsafe` entirely, operating on a `T: !Copy` value inside a `Cell` +//! would require temporarily replacing it with a default value: +//! +//! ```rust +//! 
fn option_dance(cell: &Cell, f: F) -> R +//! where T: Default, F: FnOnce(&mut T) -> R +//! { +//! let mut value = cell.take(); +//! let result = f(&mut value); +//! cell.set(value); +//! result +//! } +//! ``` +//! +//! It would be worth exploring whether LLVM can reliably optimize away these extra moves +//! and compile the `Option` dance to assembly similar to that of the `unsafe` operation. + +use std::cell::Cell; +use std::rc::{Rc, Weak}; + +pub trait CellOption { + fn is_none(&self) -> bool; +} + +impl CellOption for Cell> { + #[inline] + fn is_none(&self) -> bool { + unsafe { (*self.as_ptr()).is_none() } + } +} + +pub trait CellOptionWeak { + fn upgrade(&self) -> Option>; + fn clone_inner(&self) -> Option>; +} + +impl CellOptionWeak for Cell>> { + #[inline] + fn upgrade(&self) -> Option> { + unsafe { (*self.as_ptr()).as_ref().and_then(Weak::upgrade) } + } + + #[inline] + fn clone_inner(&self) -> Option> { + unsafe { (*self.as_ptr()).clone() } + } +} + +pub trait CellOptionRc { + /// Return `Some` if this `Rc` is the only strong reference count, + /// even if there are weak references. + fn take_if_unique_strong(&self) -> Option>; + fn clone_inner(&self) -> Option>; +} + +impl CellOptionRc for Cell>> { + #[inline] + fn take_if_unique_strong(&self) -> Option> { + unsafe { + match *self.as_ptr() { + None => None, + Some(ref rc) if Rc::strong_count(rc) > 1 => None, + // Not borrowing the `Rc` here + // as we would be invalidating that borrow while it is outstanding: + Some(_) => self.take(), + } + } + } + + #[inline] + fn clone_inner(&self) -> Option> { + unsafe { (*self.as_ptr()).clone() } + } +} diff --git a/kuchiki/src/iter.rs b/kuchiki/src/iter.rs new file mode 100644 index 0000000..75fcfc4 --- /dev/null +++ b/kuchiki/src/iter.rs @@ -0,0 +1,452 @@ +//! 
Node iterators + +use std::borrow::Borrow; +use std::cell::RefCell; +use std::iter::Rev; + +use crate::node_data_ref::NodeDataRef; +use crate::select::Selectors; +use crate::tree::{ElementData, NodeRef}; + +impl NodeRef { + /// Return an iterator of references to this node and its ancestors. + #[inline] + pub fn inclusive_ancestors(&self) -> Ancestors { + Ancestors(Some(self.clone())) + } + + /// Return an iterator of references to this node’s ancestors. + #[inline] + pub fn ancestors(&self) -> Ancestors { + Ancestors(self.parent()) + } + + /// Return an iterator of references to this node and the siblings before it. + #[inline] + pub fn inclusive_preceding_siblings(&self) -> Rev { + match self.parent() { + Some(parent) => { + let first_sibling = parent.first_child().unwrap(); + debug_assert!(self.previous_sibling().is_some() || *self == first_sibling); + Siblings(Some(State { + next: first_sibling, + next_back: self.clone(), + })) + } + None => { + debug_assert!(self.previous_sibling().is_none()); + Siblings(Some(State { + next: self.clone(), + next_back: self.clone(), + })) + } + } + .rev() + } + + /// Return an iterator of references to this node’s siblings before it. + #[inline] + pub fn preceding_siblings(&self) -> Rev { + match (self.parent(), self.previous_sibling()) { + (Some(parent), Some(previous_sibling)) => { + let first_sibling = parent.first_child().unwrap(); + Siblings(Some(State { + next: first_sibling, + next_back: previous_sibling, + })) + } + _ => Siblings(None), + } + .rev() + } + + /// Return an iterator of references to this node and the siblings after it. 
+ #[inline] + pub fn inclusive_following_siblings(&self) -> Siblings { + match self.parent() { + Some(parent) => { + let last_sibling = parent.last_child().unwrap(); + debug_assert!(self.next_sibling().is_some() || *self == last_sibling); + Siblings(Some(State { + next: self.clone(), + next_back: last_sibling, + })) + } + None => { + debug_assert!(self.next_sibling().is_none()); + Siblings(Some(State { + next: self.clone(), + next_back: self.clone(), + })) + } + } + } + + /// Return an iterator of references to this node’s siblings after it. + #[inline] + pub fn following_siblings(&self) -> Siblings { + match (self.parent(), self.next_sibling()) { + (Some(parent), Some(next_sibling)) => { + let last_sibling = parent.last_child().unwrap(); + Siblings(Some(State { + next: next_sibling, + next_back: last_sibling, + })) + } + _ => Siblings(None), + } + } + + /// Return an iterator of references to this node’s children. + #[inline] + pub fn children(&self) -> Siblings { + match (self.first_child(), self.last_child()) { + (Some(first_child), Some(last_child)) => Siblings(Some(State { + next: first_child, + next_back: last_child, + })), + (None, None) => Siblings(None), + _ => unreachable!(), + } + } + + /// Return an iterator of references to this node and its descendants, in tree order. + /// + /// Parent nodes appear before the descendants. + /// + /// Note: this is the `NodeEdge::Start` items from `traverse()`. + #[inline] + pub fn inclusive_descendants(&self) -> Descendants { + Descendants(self.traverse_inclusive()) + } + + /// Return an iterator of references to this node’s descendants, in tree order. + /// + /// Parent nodes appear before the descendants. + /// + /// Note: this is the `NodeEdge::Start` items from `traverse()`. + #[inline] + pub fn descendants(&self) -> Descendants { + Descendants(self.traverse()) + } + + /// Return an iterator of the start and end edges of this node and its descendants, + /// in tree order. 
+ #[inline] + pub fn traverse_inclusive(&self) -> Traverse { + Traverse(Some(State { + next: NodeEdge::Start(self.clone()), + next_back: NodeEdge::End(self.clone()), + })) + } + + /// Return an iterator of the start and end edges of this node’s descendants, + /// in tree order. + #[inline] + pub fn traverse(&self) -> Traverse { + match (self.first_child(), self.last_child()) { + (Some(first_child), Some(last_child)) => Traverse(Some(State { + next: NodeEdge::Start(first_child), + next_back: NodeEdge::End(last_child), + })), + (None, None) => Traverse(None), + _ => unreachable!(), + } + } + + /// Return an iterator of the inclusive descendants element that match the given selector list. + #[inline] + pub fn select(&self, selectors: &str) -> Result>, ()> { + self.inclusive_descendants().select(selectors) + } + + /// Return the first inclusive descendants element that match the given selector list. + #[inline] + pub fn select_first(&self, selectors: &str) -> Result, ()> { + let mut elements = self.select(selectors)?; + elements.next().ok_or(()) + } +} + +#[derive(Debug, Clone)] +struct State { + next: T, + next_back: T, +} + +/// A double-ended iterator of sibling nodes. +#[derive(Debug, Clone)] +pub struct Siblings(Option>); + +macro_rules! siblings_next { + ($next: ident, $next_back: ident, $next_sibling: ident) => { + fn $next(&mut self) -> Option { + #![allow(non_shorthand_field_patterns)] + self.0.take().map(|State { $next: next, $next_back: next_back }| { + if let Some(sibling) = next.$next_sibling() { + if next != next_back { + self.0 = Some(State { $next: sibling, $next_back: next_back }) + } + } + next + }) + } + } +} + +impl Iterator for Siblings { + type Item = NodeRef; + siblings_next!(next, next_back, next_sibling); +} + +impl DoubleEndedIterator for Siblings { + siblings_next!(next_back, next, previous_sibling); +} + +/// An iterator on ancestor nodes. 
+#[derive(Debug, Clone)] +pub struct Ancestors(Option); + +impl Iterator for Ancestors { + type Item = NodeRef; + + #[inline] + fn next(&mut self) -> Option { + self.0.take().map(|node| { + self.0 = node.parent(); + node + }) + } +} + +/// An iterator of references to a given node and its descendants, in tree order. +#[derive(Debug, Clone)] +pub struct Descendants(Traverse); + +macro_rules! descendants_next { + ($next: ident) => { + #[inline] + fn $next(&mut self) -> Option { + loop { + match (self.0).$next() { + Some(NodeEdge::Start(node)) => return Some(node), + Some(NodeEdge::End(_)) => {} + None => return None + } + } + } + } +} + +impl Iterator for Descendants { + type Item = NodeRef; + descendants_next!(next); +} + +impl DoubleEndedIterator for Descendants { + descendants_next!(next_back); +} + +/// Marks either the start or the end of a node. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum NodeEdge { + /// Indicates that start of a node that has children. + /// Yielded by `Traverse::next` before the node’s descendants. + /// In HTML or XML, this corresponds to an opening tag like `
` + Start(T), + + /// Indicates the end of a node that has children. + /// Yielded by `Traverse::next` after the node’s descendants. + /// In HTML or XML, this corresponds to a closing tag like `
` + End(T), +} + +/// An iterator of the start and end edges of the nodes in a given subtree. +#[derive(Debug, Clone)] +pub struct Traverse(Option>>); + +macro_rules! traverse_next { + ($next: ident, $next_back: ident, $first_child: ident, $next_sibling: ident, $Start: ident, $End: ident) => { + fn $next(&mut self) -> Option> { + #![allow(non_shorthand_field_patterns)] + self.0.take().map(|State { $next: next, $next_back: next_back }| { + if next != next_back { + self.0 = match next { + NodeEdge::$Start(ref node) => { + match node.$first_child() { + Some(child) => { + Some(State { $next: NodeEdge::$Start(child), $next_back: next_back }) + } + None => Some(State { $next: NodeEdge::$End(node.clone()), $next_back: next_back }) + } + } + NodeEdge::$End(ref node) => { + match node.$next_sibling() { + Some(sibling) => { + Some(State { $next: NodeEdge::$Start(sibling), $next_back: next_back }) + } + None => node.parent().map(|parent| { + State { $next: NodeEdge::$End(parent), $next_back: next_back } + }) + } + } + }; + } + next + }) + } + } +} + +impl Iterator for Traverse { + type Item = NodeEdge; + traverse_next!(next, next_back, first_child, next_sibling, Start, End); +} + +impl DoubleEndedIterator for Traverse { + traverse_next!(next_back, next, last_child, previous_sibling, End, Start); +} + +macro_rules! filter_map_like_iterator { + (#[$doc: meta] $name: ident: $f: expr, $from: ty => $to: ty) => { + #[$doc] + #[derive(Debug, Clone)] + pub struct $name(pub I); + + impl Iterator for $name + where + I: Iterator, + { + type Item = $to; + + #[inline] + fn next(&mut self) -> Option<$to> { + for x in self.0.by_ref() { + if let Some(y) = ($f)(x) { + return Some(y); + } + } + None + } + } + + impl DoubleEndedIterator for $name + where + I: DoubleEndedIterator, + { + #[inline] + fn next_back(&mut self) -> Option<$to> { + for x in self.0.by_ref().rev() { + if let Some(y) = ($f)(x) { + return Some(y); + } + } + None + } + } + }; +} + +filter_map_like_iterator! 
{ + /// A node iterator adaptor that yields element nodes. + Elements: NodeRef::into_element_ref, NodeRef => NodeDataRef +} + +filter_map_like_iterator! { + /// A node iterator adaptor that yields comment nodes. + Comments: NodeRef::into_comment_ref, NodeRef => NodeDataRef> +} + +filter_map_like_iterator! { + /// A node iterator adaptor that yields text nodes. + TextNodes: NodeRef::into_text_ref, NodeRef => NodeDataRef> +} + +/// An element iterator adaptor that yields elements maching given selectors. +pub struct Select +where + I: Iterator>, + S: Borrow, +{ + /// The underlying iterator. + pub iter: I, + + /// The selectors to be matched. + pub selectors: S, +} + +impl Iterator for Select +where + I: Iterator>, + S: Borrow, +{ + type Item = NodeDataRef; + + #[inline] + fn next(&mut self) -> Option> { + for element in self.iter.by_ref() { + if self.selectors.borrow().matches(&element) { + return Some(element); + } + } + None + } +} + +impl DoubleEndedIterator for Select +where + I: DoubleEndedIterator>, + S: Borrow, +{ + #[inline] + fn next_back(&mut self) -> Option> { + for element in self.iter.by_ref().rev() { + if self.selectors.borrow().matches(&element) { + return Some(element); + } + } + None + } +} + +/// Convenience methods for node iterators. +pub trait NodeIterator: Sized + Iterator { + /// Filter this element iterator to elements. + #[inline] + fn elements(self) -> Elements { + Elements(self) + } + + /// Filter this node iterator to text nodes. + #[inline] + fn text_nodes(self) -> TextNodes { + TextNodes(self) + } + + /// Filter this node iterator to comment nodes. + #[inline] + fn comments(self) -> Comments { + Comments(self) + } + + /// Filter this node iterator to elements maching the given selectors. + #[inline] + fn select(self, selectors: &str) -> Result>, ()> { + self.elements().select(selectors) + } +} + +/// Convenience methods for element iterators. 
+pub trait ElementIterator: Sized + Iterator> { + /// Filter this element iterator to elements maching the given selectors. + #[inline] + fn select(self, selectors: &str) -> Result, ()> { + Selectors::compile(selectors).map(|s| Select { + iter: self, + selectors: s, + }) + } +} + +impl NodeIterator for I where I: Iterator {} +impl ElementIterator for I where I: Iterator> {} diff --git a/kuchiki/src/lib.rs b/kuchiki/src/lib.rs new file mode 100644 index 0000000..2c862d9 --- /dev/null +++ b/kuchiki/src/lib.rs @@ -0,0 +1,40 @@ +/*! + +Kuchiki (朽木), a HTML/XML tree manipulation library for Rust. + +*/ + +#![deny(missing_docs)] + +#[macro_use] +extern crate html5ever; +#[macro_use] +extern crate matches; + +mod attributes; +mod cell_extras; +pub mod iter; +mod node_data_ref; +mod parser; +mod select; +mod serializer; +#[cfg(test)] +mod tests; +mod tree; + +pub use attributes::{Attribute, Attributes, ExpandedName}; +pub use node_data_ref::NodeDataRef; +pub use parser::{parse_html, parse_html_with_options, parse_fragment, ParseOpts, Sink}; +pub use select::{Selector, Selectors, Specificity}; +pub use tree::{Doctype, DocumentData, ElementData, Node, NodeData, NodeRef}; + +/// This module re-exports a number of traits that are useful when using Kuchiki. +/// It can be used with: +/// +/// ```rust +/// use kuchiki::traits::*; +/// ``` +pub mod traits { + pub use html5ever::tendril::TendrilSink; + pub use crate::iter::{ElementIterator, NodeIterator}; +} diff --git a/kuchiki/src/node_data_ref.rs b/kuchiki/src/node_data_ref.rs new file mode 100644 index 0000000..2cfd8b8 --- /dev/null +++ b/kuchiki/src/node_data_ref.rs @@ -0,0 +1,116 @@ +use std::cell::RefCell; +use std::fmt; +use std::ops::Deref; +use crate::tree::{Doctype, DocumentData, ElementData, Node, NodeRef}; + +impl NodeRef { + /// If this node is an element, return a strong reference to element-specific data. 
+ #[inline] + pub fn into_element_ref(self) -> Option> { + NodeDataRef::new_opt(self, Node::as_element) + } + + /// If this node is a text node, return a strong reference to its contents. + #[inline] + pub fn into_text_ref(self) -> Option>> { + NodeDataRef::new_opt(self, Node::as_text) + } + + /// If this node is a comment, return a strong reference to its contents. + #[inline] + pub fn into_comment_ref(self) -> Option>> { + NodeDataRef::new_opt(self, Node::as_comment) + } + + /// If this node is a doctype, return a strong reference to doctype-specific data. + #[inline] + pub fn into_doctype_ref(self) -> Option> { + NodeDataRef::new_opt(self, Node::as_doctype) + } + + /// If this node is a document, return a strong reference to document-specific data. + #[inline] + pub fn into_document_ref(self) -> Option> { + NodeDataRef::new_opt(self, Node::as_document) + } +} + +/// Holds a strong reference to a node, but dereferences to some component inside of it. +#[derive(Eq)] +pub struct NodeDataRef { + _keep_alive: NodeRef, + _reference: *const T, +} + +impl NodeDataRef { + /// Create a `NodeDataRef` for a component in a given node. + #[inline] + pub fn new(rc: NodeRef, f: F) -> NodeDataRef + where + F: FnOnce(&Node) -> &T, + { + NodeDataRef { + _reference: f(&*rc), + _keep_alive: rc, + } + } + + /// Create a `NodeDataRef` for and a component that may or may not be in a given node. + #[inline] + pub fn new_opt(rc: NodeRef, f: F) -> Option> + where + F: FnOnce(&Node) -> Option<&T>, + { + f(&*rc).map(|r| r as *const T).map(move |r| NodeDataRef { + _reference: r, + _keep_alive: rc, + }) + } + + /// Access the corresponding node. 
+ #[inline] + pub fn as_node(&self) -> &NodeRef { + &self._keep_alive + } +} + +impl Deref for NodeDataRef { + type Target = T; + #[inline] + fn deref(&self) -> &T { + unsafe { &*self._reference } + } +} + +// #[derive(PartialEq)] would compare both fields +impl PartialEq for NodeDataRef { + #[inline] + fn eq(&self, other: &Self) -> bool { + self._keep_alive == other._keep_alive + } +} + +// #[derive(Clone)] would have an unnecessary `T: Clone` bound +impl Clone for NodeDataRef { + #[inline] + fn clone(&self) -> Self { + NodeDataRef { + _keep_alive: self._keep_alive.clone(), + _reference: self._reference, + } + } +} + +impl fmt::Debug for NodeDataRef { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + fmt::Debug::fmt(&**self, f) + } +} + +impl NodeDataRef { + /// Return the concatenation of all text nodes in this subtree. + pub fn text_contents(&self) -> String { + self.as_node().text_contents() + } +} diff --git a/kuchiki/src/parser.rs b/kuchiki/src/parser.rs new file mode 100644 index 0000000..745f6ac --- /dev/null +++ b/kuchiki/src/parser.rs @@ -0,0 +1,241 @@ +use html5ever::tendril::StrTendril; +use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; +use html5ever::{self, Attribute, ExpandedName, QualName}; +use std::borrow::Cow; + +use crate::attributes; +use crate::tree::NodeRef; + +/// Options for the HTML parser. +#[derive(Default)] +pub struct ParseOpts { + /// Options for the HTML tokenizer. + pub tokenizer: html5ever::tokenizer::TokenizerOpts, + + /// Options for the HTML tree builder. + pub tree_builder: html5ever::tree_builder::TreeBuilderOpts, + + /// A callback for HTML parse errors (which are never fatal). + pub on_parse_error: Option)>>, +} + +/// Parse an HTML document with html5ever and the default configuration. +pub fn parse_html() -> html5ever::Parser { + parse_html_with_options(ParseOpts::default()) +} + +/// Parse an HTML document with html5ever with custom configuration. 
+pub fn parse_html_with_options(opts: ParseOpts) -> html5ever::Parser { + let sink = Sink { + document_node: NodeRef::new_document(), + on_parse_error: opts.on_parse_error, + }; + let html5opts = html5ever::ParseOpts { + tokenizer: opts.tokenizer, + tree_builder: opts.tree_builder, + }; + html5ever::parse_document(sink, html5opts) +} + +/// Parse an HTML fragment with html5ever and the default configuration. +pub fn parse_fragment(ctx_name: QualName, ctx_attr: Vec) -> html5ever::Parser { + parse_fragment_with_options(ParseOpts::default(), ctx_name, ctx_attr) +} + +/// Parse an HTML fragment with html5ever with custom configuration. +pub fn parse_fragment_with_options(opts: ParseOpts, ctx_name: QualName, ctx_attr: Vec) -> html5ever::Parser { + let sink = Sink { + document_node: NodeRef::new_document(), + on_parse_error: opts.on_parse_error, + }; + let html5opts = html5ever::ParseOpts { + tokenizer: opts.tokenizer, + tree_builder: opts.tree_builder, + }; + html5ever::parse_fragment(sink, html5opts, ctx_name, ctx_attr) +} + +/// Receives new tree nodes during parsing. 
+pub struct Sink { + document_node: NodeRef, + on_parse_error: Option)>>, +} + +impl TreeSink for Sink { + type Output = NodeRef; + + fn finish(self) -> NodeRef { + self.document_node + } + + type Handle = NodeRef; + + #[inline] + fn parse_error(&mut self, message: Cow<'static, str>) { + if let Some(ref mut handler) = self.on_parse_error { + handler(message) + } + } + + #[inline] + fn get_document(&mut self) -> NodeRef { + self.document_node.clone() + } + + #[inline] + fn set_quirks_mode(&mut self, mode: QuirksMode) { + self.document_node + .as_document() + .unwrap() + ._quirks_mode + .set(mode) + } + + #[inline] + fn same_node(&self, x: &NodeRef, y: &NodeRef) -> bool { + x == y + } + + #[inline] + fn elem_name<'a>(&self, target: &'a NodeRef) -> ExpandedName<'a> { + target.as_element().unwrap().name.expanded() + } + + #[inline] + fn create_element( + &mut self, + name: QualName, + attrs: Vec, + _flags: ElementFlags, + ) -> NodeRef { + NodeRef::new_element( + name, + attrs.into_iter().map(|attr| { + let Attribute { + name: QualName { prefix, ns, local }, + value, + } = attr; + let value = String::from(value); + ( + attributes::ExpandedName { ns, local }, + attributes::Attribute { prefix, value }, + ) + }), + ) + } + + #[inline] + fn create_comment(&mut self, text: StrTendril) -> NodeRef { + NodeRef::new_comment(text) + } + + #[inline] + fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> NodeRef { + NodeRef::new_processing_instruction(target, data) + } + + #[inline] + fn append(&mut self, parent: &NodeRef, child: NodeOrText) { + match child { + NodeOrText::AppendNode(node) => parent.append(node), + NodeOrText::AppendText(text) => { + if let Some(last_child) = parent.last_child() { + if let Some(existing) = last_child.as_text() { + existing.borrow_mut().push_str(&text); + return; + } + } + parent.append(NodeRef::new_text(text)) + } + } + } + + #[inline] + fn append_before_sibling(&mut self, sibling: &NodeRef, child: NodeOrText) { + match child { + 
NodeOrText::AppendNode(node) => sibling.insert_before(node), + NodeOrText::AppendText(text) => { + if let Some(previous_sibling) = sibling.previous_sibling() { + if let Some(existing) = previous_sibling.as_text() { + existing.borrow_mut().push_str(&text); + return; + } + } + sibling.insert_before(NodeRef::new_text(text)) + } + } + } + + #[inline] + fn append_doctype_to_document( + &mut self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + ) { + self.document_node + .append(NodeRef::new_doctype(name, public_id, system_id)) + } + + #[inline] + fn add_attrs_if_missing(&mut self, target: &NodeRef, attrs: Vec) { + let element = target.as_element().unwrap(); + let mut attributes = element.attributes.borrow_mut(); + + for Attribute { + name: QualName { prefix, ns, local }, + value, + } in attrs + { + attributes + .map + .entry(attributes::ExpandedName { ns, local }) + .or_insert_with(|| { + let value = String::from(value); + attributes::Attribute { prefix, value } + }); + } + } + + #[inline] + fn remove_from_parent(&mut self, target: &NodeRef) { + target.detach() + } + + #[inline] + fn reparent_children(&mut self, node: &NodeRef, new_parent: &NodeRef) { + // FIXME: Can this be done more effciently in rctree, + // by moving the whole linked list of children at once? + for child in node.children() { + new_parent.append(child) + } + } + + #[inline] + fn mark_script_already_started(&mut self, _node: &NodeRef) { + // FIXME: Is this useful outside of a browser? 
+ } + + #[inline] + fn get_template_contents(&mut self, target: &NodeRef) -> NodeRef { + target + .as_element() + .unwrap() + .template_contents + .clone() + .unwrap() + } + + fn append_based_on_parent_node( + &mut self, + element: &NodeRef, + prev_element: &NodeRef, + child: NodeOrText, + ) { + if element.parent().is_some() { + self.append_before_sibling(element, child) + } else { + self.append(prev_element, child) + } + } +} diff --git a/kuchiki/src/select.rs b/kuchiki/src/select.rs new file mode 100644 index 0000000..3dea06a --- /dev/null +++ b/kuchiki/src/select.rs @@ -0,0 +1,433 @@ +use crate::attributes::ExpandedName; +use cssparser::{self, CowRcStr, ParseError, SourceLocation, ToCss}; +use html5ever::{LocalName, Namespace}; +use crate::iter::{NodeIterator, Select}; +use crate::node_data_ref::NodeDataRef; +use selectors::attr::{AttrSelectorOperation, CaseSensitivity, NamespaceConstraint}; +use selectors::context::QuirksMode; +use selectors::parser::SelectorParseErrorKind; +use selectors::parser::{ + NonTSPseudoClass, Parser, Selector as GenericSelector, SelectorImpl, SelectorList, +}; +use selectors::{self, matching, OpaqueElement}; +use std::fmt; +use crate::tree::{ElementData, Node, NodeData, NodeRef}; + +/// The definition of whitespace per CSS Selectors Level 3 § 4. +/// +/// Copied from rust-selectors. 
+static SELECTOR_WHITESPACE: &[char] = &[' ', '\t', '\n', '\r', '\x0C']; + +#[derive(Debug, Clone)] +pub struct KuchikiSelectors; + +impl SelectorImpl for KuchikiSelectors { + type AttrValue = String; + type Identifier = LocalName; + type ClassName = LocalName; + type LocalName = LocalName; + type PartName = LocalName; + type NamespacePrefix = LocalName; + type NamespaceUrl = Namespace; + type BorrowedNamespaceUrl = Namespace; + type BorrowedLocalName = LocalName; + + type NonTSPseudoClass = PseudoClass; + type PseudoElement = PseudoElement; + + type ExtraMatchingData = (); +} + +struct KuchikiParser; + +impl<'i> Parser<'i> for KuchikiParser { + type Impl = KuchikiSelectors; + type Error = SelectorParseErrorKind<'i>; + + fn parse_non_ts_pseudo_class( + &self, + location: SourceLocation, + name: CowRcStr<'i>, + ) -> Result>> { + use self::PseudoClass::*; + if name.eq_ignore_ascii_case("any-link") { + Ok(AnyLink) + } else if name.eq_ignore_ascii_case("link") { + Ok(Link) + } else if name.eq_ignore_ascii_case("visited") { + Ok(Visited) + } else if name.eq_ignore_ascii_case("active") { + Ok(Active) + } else if name.eq_ignore_ascii_case("focus") { + Ok(Focus) + } else if name.eq_ignore_ascii_case("hover") { + Ok(Hover) + } else if name.eq_ignore_ascii_case("enabled") { + Ok(Enabled) + } else if name.eq_ignore_ascii_case("disabled") { + Ok(Disabled) + } else if name.eq_ignore_ascii_case("checked") { + Ok(Checked) + } else if name.eq_ignore_ascii_case("indeterminate") { + Ok(Indeterminate) + } else { + Err( + location.new_custom_error(SelectorParseErrorKind::UnsupportedPseudoClassOrElement( + name, + )), + ) + } + } +} + +#[derive(PartialEq, Eq, Clone, Debug, Hash)] +pub enum PseudoClass { + AnyLink, + Link, + Visited, + Active, + Focus, + Hover, + Enabled, + Disabled, + Checked, + Indeterminate, +} + +impl NonTSPseudoClass for PseudoClass { + type Impl = KuchikiSelectors; + + fn is_active_or_hover(&self) -> bool { + matches!(*self, PseudoClass::Active | 
PseudoClass::Hover) + } + + fn is_user_action_state(&self) -> bool { + matches!(*self, PseudoClass::Active | PseudoClass::Hover | PseudoClass::Focus) + } + + fn has_zero_specificity(&self) -> bool { + false + } +} + +impl ToCss for PseudoClass { + fn to_css(&self, dest: &mut W) -> fmt::Result + where + W: fmt::Write, + { + dest.write_str(match *self { + PseudoClass::AnyLink => ":any-link", + PseudoClass::Link => ":link", + PseudoClass::Visited => ":visited", + PseudoClass::Active => ":active", + PseudoClass::Focus => ":focus", + PseudoClass::Hover => ":hover", + PseudoClass::Enabled => ":enabled", + PseudoClass::Disabled => ":disabled", + PseudoClass::Checked => ":checked", + PseudoClass::Indeterminate => ":indeterminate", + }) + } +} + +#[derive(PartialEq, Eq, Clone, Debug, Hash)] +pub enum PseudoElement {} + +impl ToCss for PseudoElement { + fn to_css(&self, _dest: &mut W) -> fmt::Result + where + W: fmt::Write, + { + match *self {} + } +} + +impl selectors::parser::PseudoElement for PseudoElement { + type Impl = KuchikiSelectors; +} + +impl selectors::Element for NodeDataRef { + type Impl = KuchikiSelectors; + + #[inline] + fn opaque(&self) -> OpaqueElement { + let node: &Node = self.as_node(); + OpaqueElement::new(node) + } + + #[inline] + fn is_html_slot_element(&self) -> bool { + false + } + #[inline] + fn parent_node_is_shadow_root(&self) -> bool { + false + } + #[inline] + fn containing_shadow_host(&self) -> Option { + None + } + + #[inline] + fn parent_element(&self) -> Option { + self.as_node().parent().and_then(NodeRef::into_element_ref) + } + #[inline] + fn prev_sibling_element(&self) -> Option { + self.as_node().preceding_siblings().elements().next() + } + #[inline] + fn next_sibling_element(&self) -> Option { + self.as_node().following_siblings().elements().next() + } + #[inline] + fn is_empty(&self) -> bool { + self.as_node().children().all(|child| match *child.data() { + NodeData::Element(_) => false, + NodeData::Text(ref text) => 
text.borrow().is_empty(), + _ => true, + }) + } + #[inline] + fn is_root(&self) -> bool { + match self.as_node().parent() { + None => false, + Some(parent) => matches!(*parent.data(), NodeData::Document(_)), + } + } + + #[inline] + fn is_html_element_in_html_document(&self) -> bool { + // FIXME: Have a notion of HTML document v.s. XML document? + self.name.ns == ns!(html) + } + + #[inline] + fn has_local_name(&self, name: &LocalName) -> bool { + self.name.local == *name + } + #[inline] + fn has_namespace(&self, namespace: &Namespace) -> bool { + self.name.ns == *namespace + } + + #[inline] + fn is_part(&self, _name: &LocalName) -> bool { + false + } + + #[inline] + fn exported_part(&self, _: &LocalName) -> Option { + None + } + + #[inline] + fn imported_part(&self, _: &LocalName) -> Option { + None + } + + #[inline] + fn is_pseudo_element(&self) -> bool { + false + } + + #[inline] + fn is_same_type(&self, other: &Self) -> bool { + self.name == other.name + } + + #[inline] + fn is_link(&self) -> bool { + self.name.ns == ns!(html) + && matches!( + self.name.local, + local_name!("a") | local_name!("area") | local_name!("link") + ) + && self + .attributes + .borrow() + .map + .contains_key(&ExpandedName::new(ns!(), local_name!("href"))) + } + + #[inline] + fn has_id(&self, id: &LocalName, case_sensitivity: CaseSensitivity) -> bool { + self.attributes + .borrow() + .get(local_name!("id")) + .map_or(false, |id_attr| { + case_sensitivity.eq(id.as_bytes(), id_attr.as_bytes()) + }) + } + + #[inline] + fn has_class(&self, name: &LocalName, case_sensitivity: CaseSensitivity) -> bool { + let name = name.as_bytes(); + !name.is_empty() + && if let Some(class_attr) = self.attributes.borrow().get(local_name!("class")) { + class_attr + .split(SELECTOR_WHITESPACE) + .any(|class| case_sensitivity.eq(class.as_bytes(), name)) + } else { + false + } + } + + #[inline] + fn attr_matches( + &self, + ns: &NamespaceConstraint<&Namespace>, + local_name: &LocalName, + operation: 
&AttrSelectorOperation<&String>, + ) -> bool { + let attrs = self.attributes.borrow(); + match *ns { + NamespaceConstraint::Any => attrs + .map + .iter() + .any(|(name, attr)| name.local == *local_name && operation.eval_str(&attr.value)), + NamespaceConstraint::Specific(ns_url) => attrs + .map + .get(&ExpandedName::new(ns_url, local_name.clone())) + .map_or(false, |attr| operation.eval_str(&attr.value)), + } + } + + fn match_pseudo_element( + &self, + pseudo: &PseudoElement, + _context: &mut matching::MatchingContext, + ) -> bool { + match *pseudo {} + } + + fn match_non_ts_pseudo_class( + &self, + pseudo: &PseudoClass, + _context: &mut matching::MatchingContext, + _flags_setter: &mut F, + ) -> bool + where + F: FnMut(&Self, matching::ElementSelectorFlags), + { + use self::PseudoClass::*; + match *pseudo { + Active | Focus | Hover | Enabled | Disabled | Checked | Indeterminate | Visited => { + false + } + AnyLink | Link => { + self.name.ns == ns!(html) + && matches!( + self.name.local, + local_name!("a") | local_name!("area") | local_name!("link") + ) + && self.attributes.borrow().contains(local_name!("href")) + } + } + } +} + +/// A pre-compiled list of CSS Selectors. +pub struct Selectors(pub Vec); + +/// A pre-compiled CSS Selector. +pub struct Selector(GenericSelector); + +/// The specificity of a selector. +/// +/// Opaque, but ordered. +/// +/// Determines precedence in the cascading algorithm. +/// When equal, a rule later in source order takes precedence. +#[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd)] +pub struct Specificity(u32); + +impl Selectors { + /// Compile a list of selectors. This may fail on syntax errors or unsupported selectors. 
+ #[inline] + pub fn compile(s: &str) -> Result { + let mut input = cssparser::ParserInput::new(s); + match SelectorList::parse(&KuchikiParser, &mut cssparser::Parser::new(&mut input)) { + Ok(list) => Ok(Selectors(list.0.into_iter().map(Selector).collect())), + Err(_) => Err(()), + } + } + + /// Returns whether the given element matches this list of selectors. + #[inline] + pub fn matches(&self, element: &NodeDataRef) -> bool { + self.0.iter().any(|s| s.matches(element)) + } + + /// Filter an element iterator, yielding those matching this list of selectors. + #[inline] + pub fn filter(&self, iter: I) -> Select + where + I: Iterator>, + { + Select { + iter, + selectors: self, + } + } +} + +impl Selector { + /// Returns whether the given element matches this selector. + #[inline] + pub fn matches(&self, element: &NodeDataRef) -> bool { + let mut context = matching::MatchingContext::new( + matching::MatchingMode::Normal, + None, + None, + QuirksMode::NoQuirks, + ); + matching::matches_selector(&self.0, 0, None, element, &mut context, &mut |_, _| {}) + } + + /// Return the specificity of this selector. 
+ pub fn specificity(&self) -> Specificity { + Specificity(self.0.specificity()) + } +} + +impl ::std::str::FromStr for Selectors { + type Err = (); + #[inline] + fn from_str(s: &str) -> Result { + Selectors::compile(s) + } +} + +impl fmt::Display for Selector { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.0.to_css(f) + } +} + +impl fmt::Display for Selectors { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut iter = self.0.iter(); + let first = iter + .next() + .expect("Empty Selectors, should contain at least one selector"); + first.0.to_css(f)?; + for selector in iter { + f.write_str(", ")?; + selector.0.to_css(f)?; + } + Ok(()) + } +} + +impl fmt::Debug for Selector { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +impl fmt::Debug for Selectors { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} diff --git a/kuchiki/src/serializer.rs b/kuchiki/src/serializer.rs new file mode 100644 index 0000000..4b4936c --- /dev/null +++ b/kuchiki/src/serializer.rs @@ -0,0 +1,105 @@ +use html5ever::serialize::TraversalScope::*; +use html5ever::serialize::{serialize, Serialize, SerializeOpts, Serializer, TraversalScope}; +use html5ever::QualName; +use std::fs::File; +use std::io::{Result, Write}; +use std::path::Path; +use std::string::ToString; + +use crate::tree::{NodeData, NodeRef}; + +impl Serialize for NodeRef { + fn serialize( + &self, + serializer: &mut S, + traversal_scope: TraversalScope, + ) -> Result<()> { + match (traversal_scope, self.data()) { + (ref scope, &NodeData::Element(ref element)) => { + if *scope == IncludeNode { + let attrs = element.attributes.borrow(); + + // Unfortunately we need to allocate something to hold these &'a QualName + let attrs = attrs + .map + .iter() + .map(|(name, attr)| { + ( + QualName::new( + attr.prefix.clone(), + name.ns.clone(), + name.local.clone(), + ), + &attr.value, + ) + }) + .collect::>(); + + 
serializer.start_elem( + element.name.clone(), + attrs.iter().map(|&(ref name, value)| (name, &**value)), + )? + } + + for child in self.children() { + Serialize::serialize(&child, serializer, IncludeNode)? + } + + if *scope == IncludeNode { + serializer.end_elem(element.name.clone())? + } + Ok(()) + } + + (_, &NodeData::DocumentFragment) | (_, &NodeData::Document(_)) => { + for child in self.children() { + Serialize::serialize(&child, serializer, IncludeNode)? + } + Ok(()) + } + + (ChildrenOnly(_), _) => Ok(()), + + (IncludeNode, &NodeData::Doctype(ref doctype)) => { + serializer.write_doctype(&doctype.name) + } + (IncludeNode, &NodeData::Text(ref text)) => serializer.write_text(&text.borrow()), + (IncludeNode, &NodeData::Comment(ref text)) => serializer.write_comment(&text.borrow()), + (IncludeNode, &NodeData::ProcessingInstruction(ref contents)) => { + let contents = contents.borrow(); + serializer.write_processing_instruction(&contents.0, &contents.1) + } + } + } +} + +impl ToString for NodeRef { + #[inline] + fn to_string(&self) -> String { + let mut u8_vec = Vec::new(); + self.serialize(&mut u8_vec).unwrap(); + String::from_utf8(u8_vec).unwrap() + } +} + +impl NodeRef { + /// Serialize this node and its descendants in HTML syntax to the given stream. + #[inline] + pub fn serialize(&self, writer: &mut W) -> Result<()> { + serialize( + writer, + self, + SerializeOpts { + traversal_scope: IncludeNode, + ..Default::default() + }, + ) + } + + /// Serialize this node and its descendants in HTML syntax to a new file at the given path. 
+ #[inline] + pub fn serialize_to_file>(&self, path: P) -> Result<()> { + let mut file = File::create(&path)?; + self.serialize(&mut file) + } +} diff --git a/kuchiki/src/tests.rs b/kuchiki/src/tests.rs new file mode 100644 index 0000000..1ccc1b2 --- /dev/null +++ b/kuchiki/src/tests.rs @@ -0,0 +1,185 @@ +use html5ever::tree_builder::QuirksMode; +use html5ever::QualName; +use std::path::Path; + +use tempfile::TempDir; + +use crate::parser::{parse_html, parse_fragment}; +use crate::select::*; +use crate::traits::*; + +#[test] +fn text_nodes() { + let html = r" + +Test case +

Content contains Important data

"; + let document = parse_html().one(html); + let paragraph = document.select("p").unwrap().collect::>(); + assert_eq!(paragraph.len(), 1); + assert_eq!( + paragraph[0].text_contents(), + "Content contains Important data" + ); + let texts = paragraph[0] + .as_node() + .descendants() + .text_nodes() + .collect::>(); + assert_eq!(texts.len(), 3); + assert_eq!(&*texts[0].borrow(), "Content contains "); + assert_eq!(&*texts[1].borrow(), "Important"); + assert_eq!(&*texts[2].borrow(), " data"); + { + let mut x = texts[0].borrow_mut(); + x.truncate(0); + x.push_str("Content doesn't contain "); + } + assert_eq!(&*texts[0].borrow(), "Content doesn't contain "); +} + +#[test] +fn parse_and_serialize() { + let html = r" + +Test case +

Content"; + let document = parse_html().one(html); + assert_eq!( + document.as_document().unwrap().quirks_mode(), + QuirksMode::NoQuirks + ); + assert_eq!( + document.to_string(), + r"Test case +

Content

" + ); +} + +#[test] +fn parse_and_serialize_fragment() { + let html = r"Test case"; + + let ctx_name = QualName::new(None, ns!(html), local_name!("tbody")); + let document = parse_fragment(ctx_name, vec![]).one(html); + assert_eq!(document.as_document().unwrap().quirks_mode(), QuirksMode::NoQuirks); + assert_eq!(document.to_string(), r"Test case"); +} + +#[test] +fn parse_file() { + let mut path = Path::new(env!("CARGO_MANIFEST_DIR")).to_path_buf(); + path.push("test_data".to_string()); + path.push("foo.html"); + + let html = r" + Test case + + +

Foo

+ + +"; + let document = parse_html().from_utf8().from_file(&path).unwrap(); + assert_eq!(document.to_string(), html); +} + +#[test] +fn serialize_and_read_file() { + let tempdir = TempDir::new().unwrap(); + let mut path = tempdir.path().to_path_buf(); + path.push("temp.html"); + + let html = r"TitleBody"; + let document = parse_html().one(html); + let _ = document.serialize_to_file(path.clone()); + + let document2 = parse_html().from_utf8().from_file(&path).unwrap(); + assert_eq!(document.to_string(), document2.to_string()); +} + +#[test] +fn select() { + let html = r" +Test case +

Foo +

Bar +

Foo +"; + + let document = parse_html().one(html); + let matching = document.select("p.foo").unwrap().collect::>(); + assert_eq!(matching.len(), 2); + let child = matching[0].as_node().first_child().unwrap(); + assert_eq!(&**child.as_text().unwrap().borrow(), "Foo\n"); + assert_eq!(matching[0].attributes.borrow().get("class"), Some("foo")); + assert_eq!( + matching[0].attributes.borrow().get(local_name!("class")), + Some("foo") + ); + + let selectors = Selectors::compile("p.foo").unwrap(); + let matching2 = selectors + .filter(document.descendants().elements()) + .collect::>(); + assert_eq!(matching, matching2); +} + +#[test] +fn select_first() { + let html = r" +Test case +

Foo +

Bar +

Baz +"; + + let document = parse_html().one(html); + let matching = document.select_first("p.foo").unwrap(); + let child = matching.as_node().first_child().unwrap(); + assert_eq!(&**child.as_text().unwrap().borrow(), "Foo\n"); + assert_eq!(matching.attributes.borrow().get("class"), Some("foo")); + assert_eq!( + matching.attributes.borrow().get(local_name!("class")), + Some("foo") + ); + + assert!(document.select_first("p.bar").is_err()); +} + +#[test] +fn to_string() { + let html = r" + + + Test case + + +

Foo + +"; + + let document = parse_html().one(html); + assert_eq!( + document + .inclusive_descendants() + .nth(11) + .unwrap() + .to_string(), + "

Foo\n \n

" + ); +} + +#[test] +fn specificity() { + let selectors = Selectors::compile(".example, :first-child, div").unwrap(); + let specificities = selectors + .0 + .iter() + .map(|s| s.specificity()) + .collect::>(); + assert_eq!(specificities.len(), 3); + assert!(specificities[0] == specificities[1]); + assert!(specificities[0] > specificities[2]); + assert!(specificities[1] > specificities[2]); +} diff --git a/kuchiki/src/tree.rs b/kuchiki/src/tree.rs new file mode 100644 index 0000000..92483a2 --- /dev/null +++ b/kuchiki/src/tree.rs @@ -0,0 +1,489 @@ +use html5ever::tree_builder::QuirksMode; +use html5ever::QualName; +use std::cell::{Cell, RefCell}; +use std::fmt; +use std::ops::Deref; +use std::rc::{Rc, Weak}; + +use crate::attributes::{Attribute, Attributes, ExpandedName}; +use crate::cell_extras::*; +use crate::iter::NodeIterator; + +/// Node data specific to the node type. +#[derive(Debug, PartialEq, Clone)] +pub enum NodeData { + /// Element node + Element(ElementData), + + /// Text node + Text(RefCell), + + /// Comment node + Comment(RefCell), + + /// Processing instruction node + ProcessingInstruction(RefCell<(String, String)>), + + /// Doctype node + Doctype(Doctype), + + /// Document node + Document(DocumentData), + + /// Document fragment node + DocumentFragment, +} + +/// Data specific to doctype nodes. +#[derive(Debug, PartialEq, Clone)] +pub struct Doctype { + /// The name of the doctype + pub name: String, + + /// The public ID of the doctype + pub public_id: String, + + /// The system ID of the doctype + pub system_id: String, +} + +/// Data specific to element nodes. +#[derive(Debug, PartialEq, Clone)] +pub struct ElementData { + /// The namespace and local name of the element, such as `ns!(html)` and `body`. + pub name: QualName, + + /// The attributes of the elements. + pub attributes: RefCell, + + /// If the element is an HTML `