Use zero copy deserialization (#138)

This commit is contained in:
Ajeet D'Souza 2021-01-08 20:45:47 +05:30 committed by GitHub
parent ff16bf140c
commit dcdcec4a78
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 204 additions and 137 deletions

View File

@ -1,8 +1,8 @@
use super::Cmd; use super::Cmd;
use crate::config; use crate::config;
use crate::store::StoreBuilder;
use crate::util; use crate::util;
use crate::store::Store;
use anyhow::Result; use anyhow::Result;
use clap::Clap; use clap::Clap;
@ -40,7 +40,8 @@ impl Cmd for Add {
let data_dir = config::zo_data_dir()?; let data_dir = config::zo_data_dir()?;
let max_age = config::zo_maxage()?; let max_age = config::zo_maxage()?;
let mut store = Store::open(&data_dir)?; let mut store = StoreBuilder::new(data_dir);
let mut store = store.build()?;
store.add(path, now); store.add(path, now);
store.age(max_age); store.age(max_age);

View File

@ -3,7 +3,7 @@ use crate::config;
use crate::import::{Autojump, Import as _, Z}; use crate::import::{Autojump, Import as _, Z};
use crate::util; use crate::util;
use crate::store::Store; use crate::store::StoreBuilder;
use anyhow::{bail, Result}; use anyhow::{bail, Result};
use clap::{ArgEnum, Clap}; use clap::{ArgEnum, Clap};
@ -27,7 +27,8 @@ impl Cmd for Import {
fn run(&self) -> Result<()> { fn run(&self) -> Result<()> {
let data_dir = config::zo_data_dir()?; let data_dir = config::zo_data_dir()?;
let mut store = Store::open(&data_dir)?; let mut store = StoreBuilder::new(data_dir);
let mut store = store.build()?;
if !self.merge && !store.dirs.is_empty() { if !self.merge && !store.dirs.is_empty() {
bail!("zoxide database is not empty, specify --merge to continue anyway") bail!("zoxide database is not empty, specify --merge to continue anyway")
} }

View File

@ -3,7 +3,7 @@ use crate::config;
use crate::fzf::Fzf; use crate::fzf::Fzf;
use crate::util; use crate::util;
use crate::store::{self, Store}; use crate::store::{self, StoreBuilder};
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use clap::Clap; use clap::Clap;
@ -30,7 +30,8 @@ pub struct Query {
impl Cmd for Query { impl Cmd for Query {
fn run(&self) -> Result<()> { fn run(&self) -> Result<()> {
let data_dir = config::zo_data_dir()?; let data_dir = config::zo_data_dir()?;
let mut store = Store::open(&data_dir)?; let mut store = StoreBuilder::new(data_dir);
let mut store = store.build()?;
let query = store::Query::new(&self.keywords); let query = store::Query::new(&self.keywords);
let now = util::current_time()?; let now = util::current_time()?;

View File

@ -1,8 +1,7 @@
use super::Cmd; use super::Cmd;
use crate::config; use crate::config;
use crate::fzf::Fzf; use crate::fzf::Fzf;
use crate::store::Query; use crate::store::{Query, StoreBuilder};
use crate::store::Store;
use crate::util; use crate::util;
use anyhow::{bail, Context, Result}; use anyhow::{bail, Context, Result};
@ -25,7 +24,8 @@ pub struct Remove {
impl Cmd for Remove { impl Cmd for Remove {
fn run(&self) -> Result<()> { fn run(&self) -> Result<()> {
let data_dir = config::zo_data_dir()?; let data_dir = config::zo_data_dir()?;
let mut store = Store::open(&data_dir)?; let mut store = StoreBuilder::new(data_dir);
let mut store = store.build()?;
let selection; let selection;
let path = match &self.interactive { let path = match &self.interactive {

View File

@ -1,4 +1,5 @@
use crate::store::Rank; use crate::store::Rank;
use anyhow::{bail, Context, Result}; use anyhow::{bail, Context, Result};
use dirs_next as dirs; use dirs_next as dirs;

View File

@ -3,6 +3,7 @@ use super::Import;
use crate::store::{Dir, Epoch, Store}; use crate::store::{Dir, Epoch, Store};
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use std::borrow::Cow;
use std::fs; use std::fs;
use std::path::Path; use std::path::Path;
@ -43,15 +44,10 @@ impl Import for Autojump {
} }
let rank_sum = entries.iter().map(|(_, rank)| rank).sum::<f64>(); let rank_sum = entries.iter().map(|(_, rank)| rank).sum::<f64>();
for (path, rank) in entries.iter() { for &(path, rank) in entries.iter() {
if store if store.dirs.iter_mut().find(|dir| dir.path == path).is_none() {
.dirs
.iter_mut()
.find(|dir| &dir.path == path)
.is_none()
{
store.dirs.push(Dir { store.dirs.push(Dir {
path: path.to_string(), path: Cow::Owned(path.into()),
rank: rank / rank_sum, rank: rank / rank_sum,
last_accessed: self.now, last_accessed: self.now,
}); });

View File

@ -3,6 +3,7 @@ use super::Import;
use crate::store::{Dir, Store}; use crate::store::{Dir, Store};
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use std::borrow::Cow;
use std::fs::File; use std::fs::File;
use std::io::{BufRead, BufReader}; use std::io::{BufRead, BufReader};
use std::path::Path; use std::path::Path;
@ -46,7 +47,7 @@ impl Import for Z {
dir.last_accessed = dir.last_accessed.max(last_accessed); dir.last_accessed = dir.last_accessed.max(last_accessed);
} }
None => store.dirs.push(Dir { None => store.dirs.push(Dir {
path: path.to_string(), path: Cow::Owned(path.into()),
rank, rank,
last_accessed, last_accessed,
}), }),

View File

@ -1,27 +1,110 @@
use super::{Epoch, Query, Rank}; use super::Query;
use anyhow::{bail, Context, Result};
use bincode::Options as _;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use std::fmt::{self, Display, Formatter}; use std::fmt::{self, Display, Formatter};
use std::ops::{Deref, DerefMut};
use std::path::Path; use std::path::Path;
#[derive(Debug, Deserialize, Serialize)] #[derive(Debug, Deserialize, Serialize)]
pub struct Dir { pub struct DirList<'a>(#[serde(borrow)] Vec<Dir<'a>>);
pub path: String,
impl DirList<'_> {
    /// Format version written as the first section of every serialized store.
    const VERSION: u32 = 3;

    /// Creates an empty list that does not borrow from any buffer.
    pub fn new() -> DirList<'static> {
        DirList(Vec::new())
    }

    /// Decodes a store image laid out as `[version][directories]`.
    ///
    /// # Errors
    ///
    /// Fails when `bytes` is shorter than the version section, when the
    /// version differs from [`Self::VERSION`], or when bincode rejects
    /// either section.
    pub fn from_bytes(bytes: &[u8]) -> Result<DirList> {
        // Assume a maximum size for the store. This prevents bincode from
        // throwing strange errors when it encounters invalid data.
        const MAX_SIZE: u64 = 8 << 20; // 8 MiB
        let codec = &mut bincode::options()
            .with_fixint_encoding()
            .with_limit(MAX_SIZE);

        // The version section has a fixed encoded width; everything after it
        // is the directory list.
        let header_len = codec.serialized_size(&Self::VERSION).unwrap() as _;
        if bytes.len() < header_len {
            bail!("could not deserialize store: corrupted data");
        }
        let (header, payload) = bytes.split_at(header_len);

        // Decode both sections inside one closure so that every failure
        // receives the same context message.
        (|| {
            let found: u32 = codec.deserialize(header)?;
            if found != Self::VERSION {
                bail!(
                    "unsupported version (got {}, supports {})",
                    found,
                    Self::VERSION,
                );
            }
            Ok(codec.deserialize(payload)?)
        })()
        .context("could not deserialize store")
    }

    /// Encodes the list as `[version][directories]`.
    ///
    /// # Errors
    ///
    /// Fails when bincode cannot size or serialize either section.
    pub fn to_bytes(&self) -> Result<Vec<u8>> {
        (|| -> bincode::Result<_> {
            // Size both sections up front so the output is allocated once.
            let capacity =
                bincode::serialized_size(&Self::VERSION)? + bincode::serialized_size(&self)?;
            let mut out = Vec::with_capacity(capacity as _);

            // Version first, then the directories.
            bincode::serialize_into(&mut out, &Self::VERSION)?;
            bincode::serialize_into(&mut out, &self)?;
            Ok(out)
        })()
        .context("could not serialize store")
    }
}
impl<'a> Deref for DirList<'a> {
type Target = Vec<Dir<'a>>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<'a> DerefMut for DirList<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
impl<'a> From<Vec<Dir<'a>>> for DirList<'a> {
fn from(dirs: Vec<Dir<'a>>) -> Self {
DirList(dirs)
}
}
#[derive(Debug, Deserialize, Serialize)]
pub struct Dir<'a> {
#[serde(borrow)]
pub path: Cow<'a, str>,
pub rank: Rank, pub rank: Rank,
pub last_accessed: Epoch, pub last_accessed: Epoch,
} }
impl Dir { impl Dir<'_> {
pub fn is_match(&self, query: &Query) -> bool { pub fn is_match(&self, query: &Query) -> bool {
query.matches(&self.path) && Path::new(&self.path).is_dir() query.matches(&self.path) && Path::new(self.path.as_ref()).is_dir()
} }
pub fn get_score(&self, now: Epoch) -> Rank { pub fn score(&self, now: Epoch) -> Rank {
const HOUR: Epoch = 60 * 60; const HOUR: Epoch = 60 * 60;
const DAY: Epoch = 24 * HOUR; const DAY: Epoch = 24 * HOUR;
const WEEK: Epoch = 7 * DAY; const WEEK: Epoch = 7 * DAY;
// The older the entry, the lesser its importance.
let duration = now.saturating_sub(self.last_accessed); let duration = now.saturating_sub(self.last_accessed);
if duration < HOUR { if duration < HOUR {
self.rank * 4.0 self.rank * 4.0
@ -44,7 +127,7 @@ impl Dir {
} }
pub struct DirDisplay<'a> { pub struct DirDisplay<'a> {
dir: &'a Dir, dir: &'a Dir<'a>,
} }
impl Display for DirDisplay<'_> { impl Display for DirDisplay<'_> {
@ -54,13 +137,13 @@ impl Display for DirDisplay<'_> {
} }
pub struct DirDisplayScore<'a> { pub struct DirDisplayScore<'a> {
dir: &'a Dir, dir: &'a Dir<'a>,
now: Epoch, now: Epoch,
} }
impl Display for DirDisplayScore<'_> { impl Display for DirDisplayScore<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
let score = self.dir.get_score(self.now); let score = self.dir.score(self.now);
let score = if score > 9999.0 { let score = if score > 9999.0 {
9999 9999
} else if score > 0.0 { } else if score > 0.0 {
@ -71,3 +154,6 @@ impl Display for DirDisplayScore<'_> {
write!(f, "{:>4} {}", score, self.dir.path) write!(f, "{:>4} {}", score, self.dir.path)
} }
} }
pub type Rank = f64;
pub type Epoch = u64;

View File

@ -1,111 +1,32 @@
mod dir; mod dir;
mod query; mod query;
use anyhow::{bail, Context, Result}; pub use dir::{Dir, DirList, Epoch, Rank};
use bincode::Options; pub use query::Query;
use anyhow::{Context, Result};
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use serde::{Deserialize, Serialize};
use tempfile::{NamedTempFile, PersistError}; use tempfile::{NamedTempFile, PersistError};
use std::borrow::Cow;
use std::cmp::Reverse; use std::cmp::Reverse;
use std::fs; use std::fs;
use std::io::{self, Write}; use std::io::{self, Write};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
pub use dir::Dir; pub struct Store<'a> {
pub use query::Query; pub dirs: DirList<'a>,
pub type Rank = f64;
pub type Epoch = u64;
#[derive(Debug)]
pub struct Store {
pub dirs: Vec<Dir>,
pub modified: bool, pub modified: bool,
data_dir: PathBuf, data_dir: &'a Path,
} }
impl Store { impl<'a> Store<'a> {
/// Format version expected at the start of the store file; `open` rejects any other value.
pub const CURRENT_VERSION: StoreVersion = StoreVersion(3);
/// Byte limit passed to bincode's `with_limit` when `open` decodes the store.
const MAX_SIZE: u64 = 8 * 1024 * 1024; // 8 MiB
/// Opens the store kept inside `data_dir`.
///
/// If the store file does not exist, the data directory is created and an
/// empty, unmodified store is returned; no file is written here.
///
/// # Errors
///
/// Fails when the data directory cannot be created, when the store file
/// cannot be read for a reason other than "not found", when the file is
/// shorter than the version section, when the version differs from
/// `CURRENT_VERSION`, or when bincode cannot decode either section.
pub fn open<P: Into<PathBuf>>(data_dir: P) -> Result<Store> {
    let data_dir = data_dir.into();
    let path = Self::get_path(&data_dir);

    let buffer = match fs::read(&path) {
        Ok(buffer) => buffer,
        Err(e) if e.kind() == io::ErrorKind::NotFound => {
            // Report the directory that could not be created, not the store
            // file path inside it.
            fs::create_dir_all(&data_dir).with_context(|| {
                format!("unable to create data directory: {}", data_dir.display())
            })?;
            return Ok(Store {
                dirs: Vec::new(),
                modified: false,
                data_dir,
            });
        }
        Err(e) => {
            Err(e).with_context(|| format!("could not read from store: {}", path.display()))?
        }
    };

    let deserializer = &mut bincode::options()
        .with_fixint_encoding()
        .with_limit(Self::MAX_SIZE);

    // The version section has a fixed encoded width; the rest of the file
    // is the directory list.
    let version_size = deserializer
        .serialized_size(&Self::CURRENT_VERSION)
        .unwrap() as _;
    if buffer.len() < version_size {
        bail!("data store may be corrupted: {}", path.display());
    }
    let (buffer_version, buffer_dirs) = buffer.split_at(version_size);

    let version = deserializer
        .deserialize(buffer_version)
        .with_context(|| format!("could not deserialize store version: {}", path.display()))?;

    // Only decode the directory list once the version is known to match.
    let dirs = match version {
        Self::CURRENT_VERSION => deserializer
            .deserialize(buffer_dirs)
            .with_context(|| format!("could not deserialize store: {}", path.display()))?,
        version => bail!(
            "unsupported store version, got={}, supported={}: {}",
            version.0,
            Self::CURRENT_VERSION.0,
            path.display()
        ),
    };

    Ok(Store {
        dirs,
        modified: false,
        data_dir,
    })
}
pub fn save(&mut self) -> Result<()> { pub fn save(&mut self) -> Result<()> {
if !self.modified { if !self.modified {
return Ok(()); return Ok(());
} }
let (buffer, buffer_size) = (|| -> bincode::Result<_> { let buffer = self.dirs.to_bytes()?;
let version_size = bincode::serialized_size(&Self::CURRENT_VERSION)?;
let dirs_size = bincode::serialized_size(&self.dirs)?;
let buffer_size = version_size + dirs_size;
let mut buffer = Vec::with_capacity(buffer_size as _);
bincode::serialize_into(&mut buffer, &Self::CURRENT_VERSION)?;
bincode::serialize_into(&mut buffer, &self.dirs)?;
Ok((buffer, buffer_size))
})()
.context("could not serialize store")?;
let mut file = NamedTempFile::new_in(&self.data_dir).with_context(|| { let mut file = NamedTempFile::new_in(&self.data_dir).with_context(|| {
format!( format!(
"could not create temporary store in: {}", "could not create temporary store in: {}",
@ -113,7 +34,10 @@ impl Store {
) )
})?; })?;
let _ = file.as_file().set_len(buffer_size); // Preallocate enough space on the file, preventing copying later on.
// This optimization may fail on some filesystems, but it is safe to
// ignore it and proceed.
let _ = file.as_file().set_len(buffer.len() as _);
file.write_all(&buffer).with_context(|| { file.write_all(&buffer).with_context(|| {
format!( format!(
"could not write to temporary store: {}", "could not write to temporary store: {}",
@ -121,7 +45,7 @@ impl Store {
) )
})?; })?;
let path = Self::get_path(&self.data_dir); let path = store_path(&self.data_dir);
persist(file, &path) persist(file, &path)
.with_context(|| format!("could not replace store: {}", path.display()))?; .with_context(|| format!("could not replace store: {}", path.display()))?;
@ -135,7 +59,7 @@ impl Store {
match self.dirs.iter_mut().find(|dir| dir.path == path) { match self.dirs.iter_mut().find(|dir| dir.path == path) {
None => self.dirs.push(Dir { None => self.dirs.push(Dir {
path: path.into(), path: Cow::Owned(path.into()),
last_accessed: now, last_accessed: now,
rank: 1.0, rank: 1.0,
}), }),
@ -148,13 +72,13 @@ impl Store {
self.modified = true; self.modified = true;
} }
pub fn iter_matches<'a>( pub fn iter_matches<'b>(
&'a mut self, &'b mut self,
query: &'a Query, query: &'b Query,
now: Epoch, now: Epoch,
) -> impl DoubleEndedIterator<Item = &'a Dir> { ) -> impl DoubleEndedIterator<Item = &'b Dir> {
self.dirs self.dirs
.sort_unstable_by_key(|dir| Reverse(OrderedFloat(dir.get_score(now)))); .sort_unstable_by_key(|dir| Reverse(OrderedFloat(dir.score(now))));
self.dirs.iter().filter(move |dir| dir.is_match(&query)) self.dirs.iter().filter(move |dir| dir.is_match(&query))
} }
@ -188,23 +112,18 @@ impl Store {
self.modified = true; self.modified = true;
} }
} }
/// Returns the location of the store file (`db.zo`) inside `data_dir`.
fn get_path<P: AsRef<Path>>(data_dir: P) -> PathBuf {
    let mut store_file = data_dir.as_ref().to_path_buf();
    store_file.push("db.zo");
    store_file
}
} }
impl Drop for Store { impl Drop for Store<'_> {
fn drop(&mut self) { fn drop(&mut self) {
// Since the error can't be properly handled here,
// pretty-print it instead.
if let Err(e) = self.save() { if let Err(e) = self.save() {
println!("Error: {}", e) println!("Error: {}", e)
} }
} }
} }
/// Store format version: serialized ahead of the directory list when saving
/// and compared against `Store::CURRENT_VERSION` when opening.
#[derive(Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct StoreVersion(pub u32);
#[cfg(windows)] #[cfg(windows)]
fn persist<P: AsRef<Path>>(mut file: NamedTempFile, path: P) -> Result<(), PersistError> { fn persist<P: AsRef<Path>>(mut file: NamedTempFile, path: P) -> Result<(), PersistError> {
use rand::distributions::{Distribution, Uniform}; use rand::distributions::{Distribution, Uniform};
@ -240,6 +159,62 @@ fn persist<P: AsRef<Path>>(file: NamedTempFile, path: P) -> Result<(), PersistEr
Ok(()) Ok(())
} }
/// Owns the data a [`Store`] borrows from: the data directory path and the
/// raw bytes of the store file.
pub struct StoreBuilder {
    /// Directory that holds the store file.
    data_dir: PathBuf,
    /// Contents of the store file as read by `build`; empty until then.
    buffer: Vec<u8>,
}
impl StoreBuilder {
pub fn new<P: Into<PathBuf>>(data_dir: P) -> StoreBuilder {
StoreBuilder {
data_dir: data_dir.into(),
buffer: Vec::new(),
}
}
pub fn build(&mut self) -> Result<Store> {
// Read the entire store to memory. For smaller files, this is faster
// than mmap / streaming, and allows for zero-copy deserialization.
let path = store_path(&self.data_dir);
match fs::read(&path) {
Ok(buffer) => {
self.buffer = buffer;
let dirs = DirList::from_bytes(&self.buffer)
.with_context(|| format!("could not deserialize store: {}", path.display()))?;
Ok(Store {
dirs,
modified: false,
data_dir: &self.data_dir,
})
}
Err(e) if e.kind() == io::ErrorKind::NotFound => {
// Create data directory, but don't create any file yet.
// The file will be created later by [`Store::save`]
// if any data is modified.
fs::create_dir_all(&self.data_dir).with_context(|| {
format!(
"unable to create data directory: {}",
self.data_dir.display()
)
})?;
Ok(Store {
dirs: DirList::new(),
modified: false,
data_dir: &self.data_dir,
})
}
Err(e) => {
Err(e).with_context(|| format!("could not read from store: {}", path.display()))
}
}
}
}
/// Returns the location of the store file inside `data_dir`.
fn store_path<P: AsRef<Path>>(data_dir: P) -> PathBuf {
    const DB_FILE_NAME: &str = "db.zo";

    let mut db_file = data_dir.as_ref().to_path_buf();
    db_file.push(DB_FILE_NAME);
    db_file
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@ -255,12 +230,14 @@ mod tests {
let data_dir = tempfile::tempdir().unwrap(); let data_dir = tempfile::tempdir().unwrap();
{ {
let mut store = Store::open(data_dir.path()).unwrap(); let mut store = StoreBuilder::new(data_dir.path());
let mut store = store.build().unwrap();
store.add(path, now); store.add(path, now);
store.add(path, now); store.add(path, now);
} }
{ {
let store = Store::open(data_dir.path()).unwrap(); let mut store = StoreBuilder::new(data_dir.path());
let store = store.build().unwrap();
assert_eq!(store.dirs.len(), 1); assert_eq!(store.dirs.len(), 1);
let dir = &store.dirs[0]; let dir = &store.dirs[0];
@ -280,15 +257,18 @@ mod tests {
let data_dir = tempfile::tempdir().unwrap(); let data_dir = tempfile::tempdir().unwrap();
{ {
let mut store = Store::open(data_dir.path()).unwrap(); let mut store = StoreBuilder::new(data_dir.path());
let mut store = store.build().unwrap();
store.add(path, now); store.add(path, now);
} }
{ {
let mut store = Store::open(data_dir.path()).unwrap(); let mut store = StoreBuilder::new(data_dir.path());
let mut store = store.build().unwrap();
assert!(store.remove(path)); assert!(store.remove(path));
} }
{ {
let mut store = Store::open(data_dir.path()).unwrap(); let mut store = StoreBuilder::new(data_dir.path());
let mut store = store.build().unwrap();
assert!(store.dirs.is_empty()); assert!(store.dirs.is_empty());
assert!(!store.remove(path)); assert!(!store.remove(path));
} }