From 6cdcb3b2976be758987a36491d2da4ecccad941f Mon Sep 17 00:00:00 2001 From: BlackDex Date: Fri, 4 Aug 2023 16:50:38 +0200 Subject: [PATCH] Optimized Favicon downloading Some optimizations in regards to downloading Favicon's. I also encounterd some issues with accessing some sites where the connection got dropped or closed early. This seems a reqwest/hyper thingy, https://github.com/hyperium/hyper/issues/2136. This is now also fixed. General: - Decreased struct size - Decreased memory allocations - Optimized tokenizer a bit more to only emit tags when all attributes are there and are valid. reqwest/hyper connection issue: The following changes helped solve the connection issues to some sites. The endresult is that some icons are now able to be downloaded always instead of sometimes. - Enabled some extra reqwest features, `deflate` and `native-tls-alpn` (Which do not bring in any extra crates since other crates already enabled them, but they were not active for Vaultwarden it self) - Configured reqwest to have a max amount of idle pool connections per host - Configured reqwest to timeout the idle connections in 10 seconds --- Cargo.toml | 2 +- src/api/icons.rs | 242 ++++++++++++++++++++++++----------------------- 2 files changed, 126 insertions(+), 118 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ce211917..6875db8a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -124,7 +124,7 @@ email_address = "0.2.4" handlebars = { version = "4.3.7", features = ["dir_source"] } # HTTP client (Used for favicons, version check, DUO and HIBP API) -reqwest = { version = "0.11.18", features = ["stream", "json", "gzip", "brotli", "socks", "cookies", "trust-dns"] } +reqwest = { version = "0.11.18", features = ["stream", "json", "deflate", "gzip", "brotli", "socks", "cookies", "trust-dns", "native-tls-alpn"] } # Favicon extraction libraries html5gum = "0.5.7" diff --git a/src/api/icons.rs b/src/api/icons.rs index dcf985f7..124f3a8f 100644 --- a/src/api/icons.rs +++ b/src/api/icons.rs @@ -19,7 +19,7 @@ use tokio::{ net::lookup_host, }; -use html5gum::{Emitter, EndTag, HtmlString, InfallibleTokenizer, Readable, StartTag, StringReader, Tokenizer}; +use html5gum::{Emitter, HtmlString, InfallibleTokenizer, Readable, StringReader, Tokenizer}; use crate::{ error::Error, @@ -46,10 +46,15 @@ static CLIENT: Lazy = Lazy::new(|| { // Generate the cookie store let cookie_store = Arc::new(Jar::default()); + let icon_download_timeout = Duration::from_secs(CONFIG.icon_download_timeout()); + let pool_idle_timeout = Duration::from_secs(10); // Reuse the client between requests let client = get_reqwest_client_builder() .cookie_provider(Arc::clone(&cookie_store)) - .timeout(Duration::from_secs(CONFIG.icon_download_timeout())) + .timeout(icon_download_timeout) + .pool_max_idle_per_host(5) // Configure the Hyper Pool to only have max 5 idle connections + .pool_idle_timeout(pool_idle_timeout) // Configure the Hyper Pool to timeout after 10 seconds + .trust_dns(true) .default_headers(default_headers.clone()); match client.build() { @@ -58,9 +63,11 @@ static CLIENT: Lazy = Lazy::new(|| { error!("Possible trust-dns error, trying with trust-dns disabled: '{e}'"); get_reqwest_client_builder() .cookie_provider(cookie_store) - .timeout(Duration::from_secs(CONFIG.icon_download_timeout())) - .default_headers(default_headers) + .timeout(icon_download_timeout) + .pool_max_idle_per_host(5) // Configure the Hyper Pool to only have max 5 idle connections + .pool_idle_timeout(pool_idle_timeout) // Configure the Hyper Pool to timeout after 10 seconds .trust_dns(false) + .default_headers(default_headers) .build() .expect("Failed to build client") } @@ -258,7 +265,7 @@ mod tests { } } -#[derive(Debug, Clone)] +#[derive(Clone)] enum DomainBlacklistReason { Regex, IP, @@ -415,38 +422,34 @@ fn get_favicons_node( const TAG_LINK: &[u8] = b"link"; const TAG_BASE: &[u8] = b"base"; const TAG_HEAD: &[u8] = b"head"; - const ATTR_REL: &[u8] = b"rel"; const ATTR_HREF: &[u8] = b"href"; const ATTR_SIZES: &[u8] = b"sizes"; let mut base_url = url.clone(); - let mut icon_tags: Vec = Vec::new(); + let mut icon_tags: Vec = Vec::new(); for token in dom { - match token { - FaviconToken::StartTag(tag) => { - if *tag.name == TAG_LINK - && tag.attributes.contains_key(ATTR_REL) - && tag.attributes.contains_key(ATTR_HREF) - { - let rel_value = std::str::from_utf8(tag.attributes.get(ATTR_REL).unwrap()) - .unwrap_or_default() - .to_ascii_lowercase(); - if rel_value.contains("icon") && !rel_value.contains("mask-icon") { - icon_tags.push(tag); - } - } else if *tag.name == TAG_BASE && tag.attributes.contains_key(ATTR_HREF) { - let href = std::str::from_utf8(tag.attributes.get(ATTR_HREF).unwrap()).unwrap_or_default(); - debug!("Found base href: {href}"); - base_url = match base_url.join(href) { - Ok(inner_url) => inner_url, - _ => url.clone(), - }; - } + let tag_name: &[u8] = &token.tag.name; + match tag_name { + TAG_LINK => { + icon_tags.push(token.tag); } - FaviconToken::EndTag(tag) => { - if *tag.name == TAG_HEAD { - break; - } + TAG_BASE => { + base_url = if let Some(href) = token.tag.attributes.get(ATTR_HREF) { + let href = std::str::from_utf8(href).unwrap_or_default(); + debug!("Found base href: {href}"); + match base_url.join(href) { + Ok(inner_url) => inner_url, + _ => continue, + } + } else { + continue; + }; + } + TAG_HEAD if token.closing => { + break; + } + _ => { + continue; } } } @@ -820,43 +823,64 @@ impl reqwest::cookie::CookieStore for Jar { } /// Custom FaviconEmitter for the html5gum parser. -/// The FaviconEmitter is using an almost 1:1 copy of the DefaultEmitter with some small changes. +/// The FaviconEmitter is using an optimized version of the DefaultEmitter. /// This prevents emitting tags like comments, doctype and also strings between the tags. +/// But it will also only emit the tags we need and only if they have the correct attributes /// Therefor parsing the HTML content is faster. -use std::collections::{BTreeSet, VecDeque}; +use std::collections::BTreeMap; -#[derive(Debug)] -enum FaviconToken { - StartTag(StartTag), - EndTag(EndTag), +#[derive(Default)] +pub struct Tag { + /// The tag's name, such as `"link"` or `"base"`. + pub name: HtmlString, + + /// A mapping for any HTML attributes this start tag may have. + /// + /// Duplicate attributes are ignored after the first one as per WHATWG spec. + pub attributes: BTreeMap, } -#[derive(Default, Debug)] +struct FaviconToken { + tag: Tag, + closing: bool, +} + +#[derive(Default)] struct FaviconEmitter { current_token: Option, last_start_tag: HtmlString, current_attribute: Option<(HtmlString, HtmlString)>, - seen_attributes: BTreeSet, - emitted_tokens: VecDeque, + emit_token: bool, } impl FaviconEmitter { - fn emit_token(&mut self, token: FaviconToken) { - self.emitted_tokens.push_front(token); - } + fn flush_current_attribute(&mut self, emit_current_tag: bool) { + const ATTR_HREF: &[u8] = b"href"; + const ATTR_REL: &[u8] = b"rel"; + const TAG_LINK: &[u8] = b"link"; + const TAG_BASE: &[u8] = b"base"; + const TAG_HEAD: &[u8] = b"head"; - fn flush_current_attribute(&mut self) { - if let Some((k, v)) = self.current_attribute.take() { - match self.current_token { - Some(FaviconToken::StartTag(ref mut tag)) => { - tag.attributes.entry(k).and_modify(|_| {}).or_insert(v); - } - Some(FaviconToken::EndTag(_)) => { - self.seen_attributes.insert(k); - } - _ => { - debug_assert!(false); + if let Some(ref mut token) = self.current_token { + let tag_name: &[u8] = &token.tag.name; + + if self.current_attribute.is_some() && (tag_name == TAG_BASE || tag_name == TAG_LINK) { + let (k, v) = self.current_attribute.take().unwrap(); + token.tag.attributes.entry(k).and_modify(|_| {}).or_insert(v); + } + + let tag_attr = &token.tag.attributes; + match tag_name { + TAG_HEAD if token.closing => self.emit_token = true, + TAG_BASE if tag_attr.contains_key(ATTR_HREF) => self.emit_token = true, + TAG_LINK if emit_current_tag && tag_attr.contains_key(ATTR_REL) && tag_attr.contains_key(ATTR_HREF) => { + let rel_value = + std::str::from_utf8(token.tag.attributes.get(ATTR_REL).unwrap()).unwrap_or_default(); + if rel_value.contains("icon") && !rel_value.contains("mask-icon") { + self.emit_token = true + } } + _ => (), } } } @@ -871,87 +895,71 @@ impl Emitter for FaviconEmitter { } fn pop_token(&mut self) -> Option { - self.emitted_tokens.pop_back() - } - - fn init_start_tag(&mut self) { - self.current_token = Some(FaviconToken::StartTag(StartTag::default())); - } - - fn init_end_tag(&mut self) { - self.current_token = Some(FaviconToken::EndTag(EndTag::default())); - self.seen_attributes.clear(); - } - - fn emit_current_tag(&mut self) -> Option { - self.flush_current_attribute(); - let mut token = self.current_token.take().unwrap(); - let mut emit = false; - match token { - FaviconToken::EndTag(ref mut tag) => { - // Always clean seen attributes - self.seen_attributes.clear(); - self.set_last_start_tag(None); - - // Only trigger an emit for the tag. - // This is matched, and will break the for-loop. - if *tag.name == b"head" { - emit = true; - } - } - FaviconToken::StartTag(ref mut tag) => { - // Only trriger an emit for and tags. - // These are the only tags we want to parse. - if *tag.name == b"link" || *tag.name == b"base" { - self.set_last_start_tag(Some(&tag.name)); - emit = true; - } else { - self.set_last_start_tag(None); - } - } - } - - // Only emit the tags we want to parse. - if emit { - self.emit_token(token); + if self.emit_token { + self.emit_token = false; + return self.current_token.take(); } None } + fn init_start_tag(&mut self) { + self.current_token = Some(FaviconToken { + tag: Tag::default(), + closing: false, + }); + } + + fn init_end_tag(&mut self) { + self.current_token = Some(FaviconToken { + tag: Tag::default(), + closing: true, + }); + } + + fn emit_current_tag(&mut self) -> Option { + self.flush_current_attribute(true); + self.last_start_tag.clear(); + if self.current_token.is_some() && !self.current_token.as_ref().unwrap().closing { + self.last_start_tag.extend(&*self.current_token.as_ref().unwrap().tag.name); + } + html5gum::naive_next_state(&self.last_start_tag) + } + fn push_tag_name(&mut self, s: &[u8]) { - match self.current_token { - Some( - FaviconToken::StartTag(StartTag { - ref mut name, - .. - }) - | FaviconToken::EndTag(EndTag { - ref mut name, - .. - }), - ) => { - name.extend(s); - } - _ => debug_assert!(false), + if let Some(ref mut token) = self.current_token { + token.tag.name.extend(s); } } fn init_attribute(&mut self) { - self.flush_current_attribute(); - self.current_attribute = Some(Default::default()); + self.flush_current_attribute(false); + self.current_attribute = match &self.current_token { + Some(token) => { + let tag_name: &[u8] = &token.tag.name; + match tag_name { + b"link" | b"head" | b"base" => Some(Default::default()), + _ => None, + } + } + _ => None, + }; } fn push_attribute_name(&mut self, s: &[u8]) { - self.current_attribute.as_mut().unwrap().0.extend(s); + if let Some(attr) = &mut self.current_attribute { + attr.0.extend(s) + } } fn push_attribute_value(&mut self, s: &[u8]) { - self.current_attribute.as_mut().unwrap().1.extend(s); + if let Some(attr) = &mut self.current_attribute { + attr.1.extend(s) + } } fn current_is_appropriate_end_tag_token(&mut self) -> bool { - match self.current_token { - Some(FaviconToken::EndTag(ref tag)) => !self.last_start_tag.is_empty() && self.last_start_tag == tag.name, + match &self.current_token { + Some(token) if token.closing => !self.last_start_tag.is_empty() && self.last_start_tag == token.tag.name, _ => false, } }