From 48baf723a422f0a4964807b12b002a589799d6ee Mon Sep 17 00:00:00 2001
From: BlackDex
Date: Tue, 8 Dec 2020 17:33:15 +0100
Subject: [PATCH] Updated icon downloading

- Added more checks to prevent panics (removed unwrap)
- Try to download from the base domain, or add www, when the provided domain fails
- Added some more domain validation checks to prevent errors
- Added the ICON_BLACKLIST_REGEX to a Lazy Static HashMap, which speeds up the checks
- Validate the Regex before starting and on config changes
- Some cleanups
- Disabled some noisy debugging from 2 crates
---
 .env.template    |   3 +-
 src/api/icons.rs | 162 +++++++++++++++++++++++++++++++++++------------
 src/config.rs    |  10 +++
 src/main.rs      |   3 +
 4 files changed, 138 insertions(+), 40 deletions(-)

diff --git a/.env.template b/.env.template
index 9c30ca61..8e5fab73 100644
--- a/.env.template
+++ b/.env.template
@@ -104,7 +104,8 @@
 ## Icon blacklist Regex
 ## Any domains or IPs that match this regex won't be fetched by the icon service.
 ## Useful to hide other servers in the local network. Check the WIKI for more details
-# ICON_BLACKLIST_REGEX=192\.168\.1\.[0-9].*^
+## NOTE: Always enclose this regex within single quotes!
+# ICON_BLACKLIST_REGEX='^(192\.168\.0\.[0-9]+|192\.168\.1\.[0-9]+)$'

 ## Any IP which is not defined as a global IP will be blacklisted.
 ## Useful to secure your internal environment: See https://en.wikipedia.org/wiki/Reserved_IP_addresses for a list of IPs which it will block

diff --git a/src/api/icons.rs b/src/api/icons.rs
index f4e4edf7..9c3e7796 100644
--- a/src/api/icons.rs
+++ b/src/api/icons.rs
@@ -1,7 +1,9 @@
 use std::{
+    collections::HashMap,
     fs::{create_dir_all, remove_file, symlink_metadata, File},
     io::prelude::*,
     net::{IpAddr, ToSocketAddrs},
+    sync::RwLock,
     time::{Duration, SystemTime},
 };

@@ -28,20 +30,48 @@ static CLIENT: Lazy<Client> = Lazy::new(|| {
         .unwrap()
 });

-static ICON_REL_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"icon$|apple.*icon").unwrap());
-static ICON_HREF_REGEX: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r"(?i)\w+\.(jpg|jpeg|png|ico)(\?.*)?$|^data:image.*base64").unwrap());
+// Build Regex only once since this takes a lot of time.
+static ICON_REL_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)icon$|apple.*icon").unwrap());
 static ICON_SIZE_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?x)(\d+)\D*(\d+)").unwrap());

+// Special HashMap which holds the user-defined Regex to speed up the matching.
+static ICON_BLACKLIST_REGEX: Lazy<RwLock<HashMap<String, Regex>>> = Lazy::new(|| RwLock::new(HashMap::new()));
+
+#[get("/<domain>/icon.png")]
+fn icon(domain: String) -> Option<Cached<Content<Vec<u8>>>> {
+    if !is_valid_domain(&domain) {
+        warn!("Invalid domain: {}", domain);
+        return None;
+    }
+
+    get_icon(&domain).map(|icon| Cached::long(Content(ContentType::new("image", "x-icon"), icon)))
+}
+
+/// Returns whether the provided domain is valid or not.
+///
+/// This does some manual checks and makes use of Url to do some basic checking.
+/// Domains can't be larger than 63 characters (not counting multiple subdomains) according to the RFCs, but we limit the total size to 255.
 fn is_valid_domain(domain: &str) -> bool {
-    // Don't allow empty or too big domains or path traversal
-    if domain.is_empty() || domain.len() > 255 || domain.contains("..") {
+    // If parsing the domain fails using Url, it will not work with reqwest.
+    if let Err(parse_error) = Url::parse(format!("https://{}", domain).as_str()) {
+        debug!("Domain parse error: '{}' - {:?}", domain, parse_error);
+        return false;
+    } else if domain.is_empty()
+        || domain.contains("..")
+        || domain.starts_with('.')
+        || domain.starts_with('-')
+        || domain.ends_with('-')
+    {
+        debug!("Domain validation error: '{}' is either empty, contains '..', starts with a '.', or starts or ends with a '-'", domain);
+        return false;
+    } else if domain.len() > 255 {
+        debug!("Domain validation error: '{}' exceeds 255 characters", domain);
         return false;
     }

-    // Only alphanumeric or specific characters
     for c in domain.chars() {
         if !c.is_alphanumeric() && !ALLOWED_CHARS.contains(c) {
+            debug!("Domain validation error: '{}' contains an invalid character '{}'", domain, c);
             return false;
         }
     }
@@ -49,21 +79,10 @@ fn is_valid_domain(domain: &str) -> bool {
     true
 }

-#[get("/<domain>/icon.png")]
-fn icon(domain: String) -> Option<Cached<Content<Vec<u8>>>> {
-    if !is_valid_domain(&domain) {
-        warn!("Invalid domain: {:#?}", domain);
-        return None;
-    }
-
-    get_icon(&domain).map(|icon| {
-        Cached::long(Content(ContentType::new("image", "x-icon"), icon))
-    })
-}
-
 /// TODO: This is extracted from IpAddr::is_global, which is unstable:
 /// https://doc.rust-lang.org/nightly/std/net/enum.IpAddr.html#method.is_global
 /// Remove once https://github.com/rust-lang/rust/issues/27709 is merged
+#[allow(clippy::nonminimal_bool)]
 #[cfg(not(feature = "unstable"))]
 fn is_global(ip: IpAddr) -> bool {
     match ip {
@@ -159,7 +178,7 @@ mod tests {
     }
 }

-fn check_icon_domain_is_blacklisted(domain: &str) -> bool {
+fn is_domain_blacklisted(domain: &str) -> bool {
     let mut is_blacklisted = CONFIG.icon_blacklist_non_global_ips()
         && (domain, 0)
             .to_socket_addrs()
@@ -177,7 +196,31 @@
     // Skip the regex check if the previous one is true already
     if !is_blacklisted {
         if let Some(blacklist) = CONFIG.icon_blacklist_regex() {
-            let regex = Regex::new(&blacklist).expect("Valid Regex");
+            let mut regex_hashmap = ICON_BLACKLIST_REGEX.read().unwrap();
+
+            // Use the pre-generated Regex stored in the Lazy HashMap if there is one, else generate it.
+            let regex = if let Some(regex) = regex_hashmap.get(&blacklist) {
+                regex
+            } else {
+                drop(regex_hashmap);
+
+                let mut regex_hashmap_write = ICON_BLACKLIST_REGEX.write().unwrap();
+                // Clear the current list if the previous key doesn't exist.
+                // This prevents the HashMap from growing after someone has changed the regex via the admin interface.
+                if regex_hashmap_write.len() >= 1 {
+                    regex_hashmap_write.clear();
+                }
+
+                // Generate the regex to store into the Lazy Static HashMap.
+                let blacklist_regex = Regex::new(&blacklist).unwrap();
+                regex_hashmap_write.insert(blacklist.to_string(), blacklist_regex);
+                drop(regex_hashmap_write);
+
+                regex_hashmap = ICON_BLACKLIST_REGEX.read().unwrap();
+                regex_hashmap.get(&blacklist).unwrap()
+            };
+
+            // Use the pre-generated Regex stored in the Lazy HashMap.
             if regex.is_match(&domain) {
                 warn!("Blacklisted domain: {:#?} matched {:#?}", domain, blacklist);
                 is_blacklisted = true;
@@ -213,8 +256,7 @@ fn get_icon(domain: &str) -> Option<Vec<u8>> {
         Err(e) => {
             error!("Error downloading icon: {:?}", e);
             let miss_indicator = path + ".miss";
-            let empty_icon = Vec::new();
-            save_icon(&miss_indicator, &empty_icon);
+            save_icon(&miss_indicator, &[]);
             None
         }
     }
@@ -307,11 +349,51 @@ fn get_icon_url(domain: &str) -> Result<(Vec<u8>, String), Error> {
     // Some sites have extra security in place with, for example, XSRF Tokens.
     let mut cookie_str = String::new();

-    let resp = get_page(&ssldomain).or_else(|_| get_page(&httpdomain));
+    // First check the domain as given during the request for both HTTPS and HTTP.
+    let resp = match get_page(&ssldomain).or_else(|_| get_page(&httpdomain)) {
+        Ok(c) => Ok(c),
+        Err(e) => {
+            let mut sub_resp = Err(e);
+
+            // When the domain is not an IP, and has more than one dot, remove all subdomains.
+            let is_ip = domain.parse::<IpAddr>();
+            if is_ip.is_err() && domain.matches('.').count() > 1 {
+                let mut domain_parts = domain.split('.');
+                let base_domain = format!(
+                    "{base}.{tld}",
+                    tld = domain_parts.next_back().unwrap(),
+                    base = domain_parts.next_back().unwrap()
+                );
+                if is_valid_domain(&base_domain) {
+                    let sslbase = format!("https://{}", base_domain);
+                    let httpbase = format!("http://{}", base_domain);
+                    debug!("[get_icon_url]: Trying without subdomains '{}'", base_domain);
+
+                    sub_resp = get_page(&sslbase).or_else(|_| get_page(&httpbase));
+                }
+
+            // When the domain is not an IP, and has fewer than two dots, try to add a www. prefix.
+            } else if is_ip.is_err() && domain.matches('.').count() < 2 {
+                let www_domain = format!("www.{}", domain);
+                if is_valid_domain(&www_domain) {
+                    let sslwww = format!("https://{}", www_domain);
+                    let httpwww = format!("http://{}", www_domain);
+                    debug!("[get_icon_url]: Trying with www. prefix '{}'", www_domain);
+
+                    sub_resp = get_page(&sslwww).or_else(|_| get_page(&httpwww));
+                }
+            }
+
+            sub_resp
+        }
+    };
+
     if let Ok(content) = resp {
         // Extract the URL from the response in case redirects occurred (like @ gitlab.com)
         let url = content.url().clone();
+
+        // Get all the cookies and pass them on to the next function.
+        // Needed for XSRF Cookies, for example (like @ mijn.ing.nl)
         let raw_cookies = content.headers().get_all("set-cookie");
         cookie_str = raw_cookies
             .iter()
@@ -337,14 +419,18 @@ fn get_icon_url(domain: &str) -> Result<(Vec<u8>, String), Error> {
         let favicons = soup
             .tag("link")
             .attr("rel", ICON_REL_REGEX.clone()) // Only use icon rels
-            .attr("href", ICON_HREF_REGEX.clone()) // Only allow specific extensions
+            .attr_name("href") // Make sure there is a href
             .find_all();

         // Loop through all the found icons and determine their priority
         for favicon in favicons {
             let sizes = favicon.get("sizes");
-            let href = favicon.get("href").expect("Missing href");
-            let full_href = url.join(&href).unwrap().into_string();
+            let href = favicon.get("href").unwrap();
+            // Skip invalid URLs
+            let full_href = match url.join(&href) {
+                Ok(h) => h.into_string(),
+                _ => continue,
+            };

             let priority = get_icon_priority(&full_href, sizes);
@@ -368,20 +454,18 @@ fn get_page(url: &str) -> Result<Response, Error> {
 }

 fn get_page_with_cookies(url: &str, cookie_str: &str) -> Result<Response, Error> {
-    if check_icon_domain_is_blacklisted(Url::parse(url).unwrap().host_str().unwrap_or_default()) {
-        err!("Favicon rel linked to a non blacklisted domain!");
+    if is_domain_blacklisted(Url::parse(url).unwrap().host_str().unwrap_or_default()) {
+        err!("Favicon rel linked to a blacklisted domain!");
     }

-    if cookie_str.is_empty() {
-        CLIENT.get(url).send()?.error_for_status().map_err(Into::into)
-    } else {
-        CLIENT
-            .get(url)
-            .header("cookie", cookie_str)
-            .send()?
-            .error_for_status()
-            .map_err(Into::into)
+    let mut client = CLIENT.get(url);
+    if !cookie_str.is_empty() {
+        client = client.header("cookie", cookie_str)
     }
+
+    client.send()?
+        .error_for_status()
+        .map_err(Into::into)
 }

 /// Returns an Integer with the priority of the type of the icon to prefer.
@@ -464,7 +548,7 @@ fn parse_sizes(sizes: Option<String>) -> (u16, u16) {
 }

 fn download_icon(domain: &str) -> Result<Vec<u8>, Error> {
-    if check_icon_domain_is_blacklisted(domain) {
+    if is_domain_blacklisted(domain) {
         err!("Domain is blacklisted", domain)
     }

@@ -495,7 +579,7 @@ fn download_icon(domain: &str) -> Result<Vec<u8>, Error> {
                 res.copy_to(&mut buffer)?;
                 break;
             }
-            Err(_) => info!("Download failed for {}", icon.href),
+            Err(_) => warn!("Download failed for {}", icon.href),
         };
     }
 }
diff --git a/src/config.rs b/src/config.rs
index 60dc317c..b5047cec 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -527,6 +527,16 @@ fn validate_config(cfg: &ConfigItems) -> Result<(), Error> {
         }
     }

+    // Check if the icon blacklist regex is valid
+    if let Some(ref r) = cfg.icon_blacklist_regex {
+        use regex::Regex;
+        let validate_regex = Regex::new(&r);
+        match validate_regex {
+            Ok(_) => (),
+            Err(e) => err!(format!("`ICON_BLACKLIST_REGEX` is invalid: {:#?}", e)),
+        }
+    }
+
     Ok(())
 }
diff --git a/src/main.rs b/src/main.rs
index 1da0d88e..f8d88af4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -113,6 +113,9 @@ fn init_logging(level: log::LevelFilter) -> Result<(), fern::InitError> {
         .level_for("launch_", log::LevelFilter::Off)
         .level_for("rocket::rocket", log::LevelFilter::Off)
         .level_for("rocket::fairing", log::LevelFilter::Off)
+        // Never show html5ever and hyper::proto logs, too noisy
+        .level_for("html5ever", log::LevelFilter::Off)
+        .level_for("hyper::proto", log::LevelFilter::Off)
         .chain(std::io::stdout());

     // Enable smtp debug logging only specifically for smtp when needed.
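
The ICON_BLACKLIST_REGEX change in this patch boils down to a "compile once, cache behind a Lazy RwLock<HashMap>" pattern. The standalone Rust sketch below illustrates that pattern outside the patch itself; it assumes the once_cell and regex crates are available, and the helper name cached_regex_is_match plus the main driver are illustrative only and do not exist in the patched code.

    // Minimal sketch of the cached-regex pattern, not part of the patch.
    use std::{collections::HashMap, sync::RwLock};

    use once_cell::sync::Lazy;
    use regex::Regex;

    // One compiled Regex is kept per configured pattern string.
    static BLACKLIST_REGEX_CACHE: Lazy<RwLock<HashMap<String, Regex>>> =
        Lazy::new(|| RwLock::new(HashMap::new()));

    fn cached_regex_is_match(pattern: &str, haystack: &str) -> bool {
        // Fast path: reuse a previously compiled Regex under a read lock.
        if let Some(regex) = BLACKLIST_REGEX_CACHE.read().unwrap().get(pattern) {
            return regex.is_match(haystack);
        }

        // Slow path: compile once, drop any stale entry (the pattern may have
        // been changed at runtime), and cache the freshly compiled Regex.
        // The unwrap mirrors the patch, which validates the pattern up front.
        let regex = Regex::new(pattern).unwrap();
        let result = regex.is_match(haystack);

        let mut cache = BLACKLIST_REGEX_CACHE.write().unwrap();
        cache.clear();
        cache.insert(pattern.to_string(), regex);

        result
    }

    fn main() {
        // Only the first call pays the compilation cost for this pattern.
        assert!(cached_regex_is_match(r"^192\.168\.1\.\d+$", "192.168.1.10"));
        assert!(!cached_regex_is_match(r"^192\.168\.1\.\d+$", "example.com"));
    }

Keying the map by the pattern string means a changed blacklist regex simply misses the cache, gets recompiled once, and replaces the old entry, which is the behaviour the patch aims for when the value is edited via the admin interface.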