From c836f88ff2a7f94ee7427ae04b91e702a31ab52a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Garc=C3=ADa?= Date: Sun, 7 Feb 2021 22:28:02 +0100 Subject: [PATCH] Remove soup and use a newer html5ever directly --- Cargo.lock | 209 ++++++++++++++--------------------------------- Cargo.toml | 3 +- src/api/icons.rs | 71 ++++++++++------ 3 files changed, 107 insertions(+), 176 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 68994d43..9d7ddfcc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -35,12 +35,6 @@ dependencies = [ "winapi 0.3.9", ] -[[package]] -name = "autocfg" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2" - [[package]] name = "autocfg" version = "1.0.1" @@ -134,11 +128,13 @@ dependencies = [ "dotenv", "fern", "handlebars", + "html5ever", "idna 0.2.1", "jsonwebtoken", "lettre", "libsqlite3-sys", "log 0.4.14", + "markup5ever_rcdom", "multipart", "newline-converter", "num-derive", @@ -159,7 +155,6 @@ dependencies = [ "rocket_contrib", "serde", "serde_json", - "soup", "syslog", "time 0.2.25", "u2f", @@ -298,15 +293,6 @@ dependencies = [ "parse-zoneinfo", ] -[[package]] -name = "cloudabi" -version = "0.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" -dependencies = [ - "bitflags", -] - [[package]] name = "const_fn" version = "0.4.5" @@ -847,16 +833,16 @@ dependencies = [ [[package]] name = "html5ever" -version = "0.22.5" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c213fa6a618dc1da552f54f85cba74b05d8e883c92ec4e89067736938084c26e" +checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" dependencies = [ "log 0.4.14", "mac", "markup5ever", - "proc-macro2 0.4.30", - "quote 0.6.13", - "syn 0.15.44", + "proc-macro2 1.0.24", + "quote 1.0.8", + "syn 1.0.60", ] [[package]] @@ -1005,7 +991,7 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fb1fa934250de4de8aef298d81c729a7d33d8c239daa3a7575e6b92bfc7313b" dependencies = [ - "autocfg 1.0.1", + "autocfg", "hashbrown", ] @@ -1170,10 +1156,11 @@ checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" [[package]] name = "markup5ever" -version = "0.7.5" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "897636f9850c3eef4905a5540683ed53dc9393860f0846cab2c2ddf9939862ff" +checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab" dependencies = [ + "log 0.4.14", "phf", "phf_codegen", "serde", @@ -1184,6 +1171,18 @@ dependencies = [ "tendril", ] +[[package]] +name = "markup5ever_rcdom" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f015da43bcd8d4f144559a3423f4591d69b8ce0652c905374da7205df336ae2b" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + [[package]] name = "match_cfg" version = "0.1.0" @@ -1261,7 +1260,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0f2d26ec3309788e423cfbf68ad1800f061638098d76a83681af979dc4eda19d" dependencies = [ "adler", - "autocfg 1.0.1", + "autocfg", ] [[package]] @@ -1431,7 +1430,7 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304" dependencies = [ - "autocfg 1.0.1", + "autocfg", "num-integer", "num-traits", ] @@ -1453,7 +1452,7 @@ version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" dependencies = [ - "autocfg 1.0.1", + "autocfg", "num-traits", ] @@ -1463,7 +1462,7 @@ version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" dependencies = [ - "autocfg 1.0.1", + "autocfg", ] [[package]] @@ -1548,7 +1547,7 @@ version = "0.9.60" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "921fc71883267538946025deffb622905ecad223c28efbfdef9bb59a0175f3e6" dependencies = [ - "autocfg 1.0.1", + "autocfg", "cc", "libc", "openssl-src", @@ -1735,18 +1734,18 @@ dependencies = [ [[package]] name = "phf" -version = "0.7.24" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3da44b85f8e8dfaec21adae67f95d93244b2ecf6ad2a692320598dcc8e6dd18" +checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" dependencies = [ "phf_shared", ] [[package]] name = "phf_codegen" -version = "0.7.24" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b03e85129e324ad4166b06b2c7491ae27fe3ec353af72e72cd1654c7225d517e" +checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" dependencies = [ "phf_generator", "phf_shared", @@ -1754,19 +1753,19 @@ dependencies = [ [[package]] name = "phf_generator" -version = "0.7.24" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09364cc93c159b8b06b1f4dd8a4398984503483891b0c26b867cf431fb132662" +checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" dependencies = [ "phf_shared", - "rand 0.6.5", + "rand 0.7.3", ] [[package]] name = "phf_shared" -version = "0.7.24" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "234f71a15de2288bcb7e3b6515828d22af7ec8598ee6d24c3b526fa0a80b67a0" +checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" dependencies = [ "siphasher", ] @@ -1952,25 +1951,6 @@ dependencies = [ "winapi 0.3.9", ] -[[package]] -name = "rand" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" -dependencies = [ - "autocfg 0.1.7", - "libc", - "rand_chacha 0.1.1", - "rand_core 0.4.2", - "rand_hc 0.1.0", - "rand_isaac", - "rand_jitter", - "rand_os", - "rand_pcg", - "rand_xorshift", - "winapi 0.3.9", -] - [[package]] name = "rand" version = "0.7.3" @@ -1982,6 +1962,7 @@ dependencies = [ "rand_chacha 0.2.2", "rand_core 0.5.1", "rand_hc 0.2.0", + "rand_pcg", ] [[package]] @@ -1996,16 +1977,6 @@ dependencies = [ "rand_hc 0.3.0", ] -[[package]] -name = "rand_chacha" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" -dependencies = [ - "autocfg 0.1.7", - "rand_core 0.3.1", -] - [[package]] name = "rand_chacha" version = "0.2.2" @@ -2059,15 +2030,6 @@ dependencies = [ "getrandom 0.2.2", ] -[[package]] -name = "rand_hc" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" -dependencies = [ - "rand_core 0.3.1", -] - [[package]] name = "rand_hc" version = "0.2.0" @@ -2086,57 +2048,13 @@ dependencies = [ "rand_core 0.6.1", ] -[[package]] -name = "rand_isaac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "rand_jitter" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b" -dependencies = [ - "libc", - "rand_core 0.4.2", - "winapi 0.3.9", -] - -[[package]] -name = "rand_os" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071" -dependencies = [ - "cloudabi", - "fuchsia-cprng", - "libc", - "rand_core 0.4.2", - "rdrand", - "winapi 0.3.9", -] - [[package]] name = "rand_pcg" -version = "0.1.2" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" +checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" dependencies = [ - "autocfg 0.1.7", - "rand_core 0.4.2", -] - -[[package]] -name = "rand_xorshift" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" -dependencies = [ - "rand_core 0.3.1", + "rand_core 0.5.1", ] [[package]] @@ -2564,9 +2482,9 @@ dependencies = [ [[package]] name = "siphasher" -version = "0.2.3" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac" +checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" [[package]] name = "slab" @@ -2600,16 +2518,6 @@ dependencies = [ "winapi 0.3.9", ] -[[package]] -name = "soup" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee42b8c117ede655c8ffe18dafcd239b23eb3bb7a2c71b1f01237587736f139f" -dependencies = [ - "html5ever", - "regex", -] - [[package]] name = "spin" version = "0.5.2" @@ -2688,38 +2596,29 @@ checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0" [[package]] name = "string_cache" -version = "0.7.5" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89c058a82f9fd69b1becf8c274f412281038877c553182f1d02eb027045a2d67" +checksum = "8ddb1139b5353f96e429e1a5e19fbaf663bddedaa06d1dbd49f82e352601209a" dependencies = [ "lazy_static", "new_debug_unreachable", "phf_shared", "precomputed-hash", "serde", - "string_cache_codegen", - "string_cache_shared", ] [[package]] name = "string_cache_codegen" -version = "0.4.4" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0f45ed1b65bf9a4bf2f7b7dc59212d1926e9eaf00fa998988e420fd124467c6" +checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97" dependencies = [ "phf_generator", "phf_shared", "proc-macro2 1.0.24", "quote 1.0.8", - "string_cache_shared", ] -[[package]] -name = "string_cache_shared" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1884d1bc09741d466d9b14e6d37ac89d6909cbcac41dd9ae982d4d063bbedfc" - [[package]] name = "subtle" version = "2.4.0" @@ -2879,7 +2778,7 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8190d04c665ea9e6b6a0dc45523ade572c088d2e6566244c1122671dbf4ae3a" dependencies = [ - "autocfg 1.0.1", + "autocfg", "bytes 1.0.1", "libc", "memchr", @@ -3321,6 +3220,18 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214" +[[package]] +name = "xml5ever" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b1b52e6e8614d4a58b8e70cf51ec0cc21b256ad8206708bcff8139b5bbd6a59" +dependencies = [ + "log 0.4.14", + "mac", + "markup5ever", + "time 0.1.44", +] + [[package]] name = "yansi" version = "0.5.0" diff --git a/Cargo.toml b/Cargo.toml index fcd2d76d..11df8c50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -106,7 +106,8 @@ newline-converter = "0.1.0" handlebars = { version = "3.5.2", features = ["dir_source"] } # For favicon extraction from main website -soup = "0.5.0" +html5ever = "0.25.1" +markup5ever_rcdom = "0.1.0" regex = { version = "1.4.3", features = ["std", "perf"], default-features = false } data-url = "0.1.0" diff --git a/src/api/icons.rs b/src/api/icons.rs index 60e5fc07..5abcf375 100644 --- a/src/api/icons.rs +++ b/src/api/icons.rs @@ -11,7 +11,6 @@ use once_cell::sync::Lazy; use regex::Regex; use reqwest::{blocking::Client, blocking::Response, header, Url}; use rocket::{http::ContentType, http::Cookie, response::Content, Route}; -use soup::prelude::*; use crate::{error::Error, util::Cached, CONFIG}; @@ -332,6 +331,42 @@ impl Icon { } } +fn get_favicons_node(node: &std::rc::Rc, icons: &mut Vec, url: &Url) { + if let markup5ever_rcdom::NodeData::Element { name, attrs, .. } = &node.data { + if name.local.as_ref() == "link" { + let mut has_rel = false; + let mut href = None; + let mut sizes = None; + + let attrs = attrs.borrow(); + for attr in attrs.iter() { + let attr_name = attr.name.local.as_ref(); + let attr_value = attr.value.as_ref(); + + if attr_name == "rel" && ICON_REL_REGEX.is_match(attr_value) { + has_rel = true; + } else if attr_name == "href" { + href = Some(attr_value); + } else if attr_name == "sizes" { + sizes = Some(attr_value); + } + } + + if has_rel && href.is_some() { + if let Ok(full_href) = url.join(&href.unwrap()).map(|h| h.into_string()) { + let priority = get_icon_priority(&full_href, sizes); + icons.push(Icon::new(priority, full_href)); + } + } + } + } + + // TODO: Might want to limit the recursion depth? + for child in node.children.borrow().iter() { + get_favicons_node(child, icons, url); + } +} + struct IconUrlResult { iconlist: Vec, cookies: String, @@ -431,30 +466,14 @@ fn get_icon_url(domain: &str) -> Result { // 512KB should be more than enough for the HTML, though as we only really need // the HTML header, it could potentially be reduced even further - let limited_reader = content.take(512 * 1024); + let mut limited_reader = content.take(512 * 1024); - let soup = Soup::from_reader(limited_reader)?; - // Search for and filter - let favicons = soup - .tag("link") - .attr("rel", ICON_REL_REGEX.clone()) // Only use icon rels - .attr_name("href") // Make sure there is a href - .find_all(); - - // Loop through all the found icons and determine it's priority - for favicon in favicons { - let sizes = favicon.get("sizes"); - let href = favicon.get("href").unwrap(); - // Skip invalid url's - let full_href = match url.join(&href) { - Ok(h) => h.into_string(), - _ => continue, - }; - - let priority = get_icon_priority(&full_href, sizes); - - iconlist.push(Icon::new(priority, full_href)) - } + use html5ever::tendril::TendrilSink; + let dom = html5ever::parse_document(markup5ever_rcdom::RcDom::default(), Default::default()) + .from_utf8() + .read_from(&mut limited_reader)?; + + get_favicons_node(&dom.document, &mut iconlist, &url); } else { // Add the default favicon.ico to the list with just the given domain iconlist.push(Icon::new(35, format!("{}/favicon.ico", ssldomain))); @@ -506,7 +525,7 @@ fn get_page_with_cookies(url: &str, cookie_str: &str, referer: &str) -> Result) -> u8 { +fn get_icon_priority(href: &str, sizes: Option<&str>) -> u8 { // Check if there is a dimension set let (width, height) = parse_sizes(sizes); @@ -554,7 +573,7 @@ fn get_icon_priority(href: &str, sizes: Option) -> u8 { /// let (width, height) = parse_sizes("x128x128"); // (128, 128) /// let (width, height) = parse_sizes("32"); // (0, 0) /// ``` -fn parse_sizes(sizes: Option) -> (u16, u16) { +fn parse_sizes(sizes: Option<&str>) -> (u16, u16) { let mut width: u16 = 0; let mut height: u16 = 0;