blob: bcac48539e039ca0c443d72f426c738d43e90691 [file]
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use elsa::sync::FrozenMap;
use icu_provider::prelude::*;
use std::any::Any;
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::fmt::Debug;
#[cfg(feature = "networking")]
use std::fs::File;
#[cfg(feature = "networking")]
use std::io::BufWriter;
use std::io::Cursor;
use std::io::Read;
use std::path::Path;
use std::path::PathBuf;
use std::sync::OnceLock;
use std::sync::RwLock;
use zip::ZipArchive;
/// Cache of deserialized files read through an [`AbstractFs`].
///
/// Parsed values are stored by path and handed out by reference on later requests.
pub(crate) struct SerdeCache {
/// The file system abstraction that files are read from.
pub(crate) root: AbstractFs,
// Parsed values keyed by the path they were read from. The values are type-erased;
// `read_and_parse` downcasts them back to the type the caller asks for.
cache: FrozenMap<String, Box<dyn Any + Send + Sync>>,
}
impl Debug for SerdeCache {
    /// Formats only `root`; the type-erased cache contents are deliberately left out.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Destructuring makes the omission of `cache` explicit.
        let Self { root, cache: _ } = self;
        let mut out = f.debug_struct("SerdeCache");
        out.field("root", root);
        out.finish()
    }
}
impl SerdeCache {
pub fn new(root: AbstractFs) -> Self {
Self {
root,
cache: FrozenMap::new(),
}
}
fn read_and_parse<S>(
&self,
path: &str,
parser: fn(&[u8]) -> Result<S, DataError>,
) -> Result<&S, DataError>
where
for<'de> S: serde::Deserialize<'de> + 'static + Send + Sync,
{
match self.cache.get(path) {
Some(x) => x,
None => self.cache.insert(
path.to_string(),
Box::new(
parser(&self.root.read_to_buf(path)?)
.map_err(|e| e.with_path_context(Path::new(path)))?,
),
),
}
.downcast_ref::<S>()
.ok_or_else(|| DataError::custom("Cache error").with_type_context::<S>())
}
pub fn read_and_parse_json<S>(&self, path: &str) -> Result<&S, DataError>
where
for<'de> S: serde::Deserialize<'de> + 'static + Send + Sync,
{
self.read_and_parse(path, |bytes| {
serde_json::from_slice(bytes)
.map_err(|e| DataError::custom("JSON deserialize").with_display_context(&e))
})
}
pub fn read_and_parse_toml<S>(&self, path: &str) -> Result<&S, DataError>
where
for<'de> S: serde::Deserialize<'de> + 'static + Send + Sync,
{
self.read_and_parse(path, |bytes| {
toml::from_str(
std::str::from_utf8(bytes)
.map_err(|e| DataError::custom("TOML UTF8").with_display_context(&e))?,
)
.map_err(|e| DataError::custom("TOML deserialize").with_display_context(&e))
})
}
pub fn list(&self, path: &str) -> Result<impl Iterator<Item = String>, DataError> {
self.root.list(path)
}
pub fn file_exists(&self, path: &str) -> Result<bool, DataError> {
self.root.file_exists(path)
}
}
/// A ZIP archive held in memory, together with the names of its entries.
pub(crate) struct ZipData {
// The opened archive, reading from an owned byte buffer.
archive: ZipArchive<Cursor<Vec<u8>>>,
// Names of all entries in `archive`, collected once at construction.
file_list: HashSet<String>,
}
impl ZipData {
    /// Opens `bytes` as a ZIP archive and records the names of all its entries.
    ///
    /// # Errors
    ///
    /// Returns an "Invalid ZIP file" error when the bytes cannot be opened as a ZIP archive.
    fn try_new(bytes: Vec<u8>) -> Result<Self, DataError> {
        match ZipArchive::new(Cursor::new(bytes)) {
            Ok(archive) => {
                let file_list = archive.file_names().map(String::from).collect();
                Ok(Self { archive, file_list })
            }
            Err(e) => Err(DataError::custom("Invalid ZIP file").with_display_context(&e)),
        }
    }
}
/// A tar archive held in memory, together with the names of its entries.
pub(crate) struct TarArchive {
// The tar bytes after gzip decompression (see `try_new`).
archive: Vec<u8>,
// Entry paths found in `archive`, collected once at construction.
file_list: HashSet<String>,
}
impl TarArchive {
    /// Decompresses the gzip stream in `bytes` and indexes the entry paths of the
    /// tar archive it contains.
    ///
    /// Entries that fail to read, or whose path is not valid UTF-8, are left out of
    /// the index.
    ///
    /// # Errors
    ///
    /// Fails when decompression fails or the tar entry iterator cannot be created.
    fn try_new(bytes: Vec<u8>) -> Result<Self, DataError> {
        use std::io::Read;

        let mut archive = Vec::new();
        let mut decoder = flate2::read::GzDecoder::new(Cursor::new(bytes));
        decoder.read_to_end(&mut archive)?;

        let mut file_list = HashSet::new();
        for entry in tar::Archive::new(Cursor::new(&archive)).entries_with_seek()? {
            // Unreadable entries and non-UTF-8 paths are skipped rather than reported.
            let Ok(entry) = entry else { continue };
            let Ok(entry_path) = entry.path() else { continue };
            if let Some(name) = entry_path.as_os_str().to_str() {
                file_list.insert(name.to_string());
            }
        }
        Ok(TarArchive { archive, file_list })
    }
}
/// A source of files addressed by relative path, independent of where the bytes live.
pub(crate) enum AbstractFs {
/// A directory on the local file system.
Fs(PathBuf),
/// A ZIP archive. `Ok` holds the loaded archive; `Err` holds the URL of an archive
/// that `init` has yet to download.
Zip(RwLock<Result<ZipData, String>>),
/// A gzip-compressed tar archive. `Ok` holds the loaded archive; `Err` holds the URL
/// of an archive that `init` has yet to download.
Tar(RwLock<Result<TarArchive, String>>),
/// A remote directory; files are fetched individually from `{url}/{path}`.
#[cfg(feature = "networking")]
Http(String),
/// Files held in memory, keyed by path.
Memory(BTreeMap<&'static str, &'static [u8]>),
}
impl Debug for AbstractFs {
    /// Writes only the type name; the variant and its contents are not shown.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("AbstractFs")
    }
}
impl AbstractFs {
pub fn new(root: &Path) -> Result<Self, DataError> {
if std::fs::metadata(root)
.map_err(|e| DataError::from(e).with_path_context(root))?
.is_dir()
{
Ok(Self::Fs(root.to_path_buf()))
} else if root.extension().is_some_and(|ext| ext == "zip") {
Ok(Self::Zip(RwLock::new(Ok(ZipData::try_new(
std::fs::read(root)?,
)?))))
} else if root.extension().is_some_and(|ext| ext == "gz") {
Ok(Self::Tar(RwLock::new(Ok(TarArchive::try_new(
std::fs::read(root)?,
)?))))
} else {
Err(DataError::custom("unsupported archive type").with_display_context(&root.display()))
}
}
/// Creates a ZIP-backed file system whose archive will be fetched from `path`
/// the first time it is needed.
#[cfg(feature = "networking")]
pub fn new_zip_from_url(path: String) -> Self {
    // `Err` marks the archive as not yet downloaded; `init` swaps in the data.
    let pending = RwLock::new(Err(path));
    Self::Zip(pending)
}
/// Creates a tar-backed file system whose archive will be fetched from `path`
/// the first time it is needed.
#[cfg(feature = "networking")]
pub fn new_tar_from_url(path: String) -> Self {
    // `Err` marks the archive as not yet downloaded; `init` swaps in the data.
    let pending = RwLock::new(Err(path));
    Self::Tar(pending)
}
/// Creates a file system that downloads individual files below the URL `path`.
#[cfg(feature = "networking")]
pub fn new_from_url(path: String) -> Self {
    // Stored without trailing slashes; the separator is added when a file URL is built.
    let base = path.trim_end_matches('/');
    Self::Http(base.to_owned())
}
/// Downloads `resource` into the local source cache and returns the cached path.
///
/// The cache lives in the directory named by the `ICU4X_SOURCE_CACHE` environment
/// variable, or in `icu4x-source-cache/` under the system temp directory. The file
/// is stored under the part of `resource` that follows its last `//`. A file that
/// is already cached is returned without any network access.
///
/// # Errors
///
/// Fails when the request still fails after five retries, or on any file system error.
///
/// # Panics
///
/// Panics if the internal download mutex is poisoned.
#[cfg(feature = "networking")]
fn download(resource: &str) -> Result<PathBuf, DataError> {
    let cache_dir = std::env::var_os("ICU4X_SOURCE_CACHE")
        .map(PathBuf::from)
        .unwrap_or_else(|| std::env::temp_dir().join("icu4x-source-cache/"));
    // `rsplit` always yields at least one item, so the fallback is never taken.
    let relative = resource.rsplit("//").next().unwrap_or(resource);
    let root = cache_dir.join(relative);

    if root.exists() {
        return Ok(root);
    }

    static LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
    let _one_download_at_a_time = LOCK.lock().unwrap();

    // Another thread may have finished this download while we waited for the lock.
    if root.exists() {
        return Ok(root);
    }

    log::info!("Downloading {resource}");

    if let Some(parent) = root.parent() {
        std::fs::create_dir_all(parent)?;
    }

    let mut retry = 5;
    let mut response = loop {
        match ureq::get(resource).call() {
            Ok(r) => break r.into_body().into_reader(),
            Err(e) if retry > 0 => {
                log::warn!("Download error {e:?}, retrying...");
                std::thread::sleep(std::time::Duration::from_secs(2));
                retry -= 1;
            }
            Err(e) => return Err(DataError::custom("Download").with_display_context(&e)),
        }
    };

    // Cannot write directly to the final path because we don't want other threads to
    // read the partial file.
    let tmp = root.with_extension("tmp");
    {
        let mut writer = BufWriter::new(File::create(&tmp)?);
        std::io::copy(&mut response, &mut writer)?;
        // Flush explicitly: the implicit flush on drop discards errors, which would
        // let a truncated file be renamed into the cache.
        std::io::Write::flush(&mut writer)?;
        // `writer` is dropped here, closing the file before it is renamed.
    }
    std::fs::rename(&tmp, &root)?;
    Ok(root)
}
/// Makes sure a URL-backed ZIP or tar archive has been downloaded and opened.
///
/// Does nothing for the other variants, for archives that are already loaded, and
/// when the `networking` feature is disabled.
///
/// # Errors
///
/// Fails when the download, the read of the downloaded file, or opening the
/// archive fails; the archive then stays unloaded.
fn init(&self) -> Result<(), DataError> {
    #[cfg(feature = "networking")]
    if let Self::Zip(lock) = self {
        // Cheap check under the read lock; the guard is released at the end of this statement.
        let already_loaded = lock.read().expect("poison").is_ok();
        if already_loaded {
            return Ok(());
        }
        let mut slot = lock.write().expect("poison");
        // Re-check: another thread may have loaded the archive in the meantime.
        let Err(url) = &*slot else {
            return Ok(());
        };
        let local = Self::download(url)?;
        let data =
            ZipData::try_new(std::fs::read(&local)?).map_err(|e| e.with_path_context(&local))?;
        *slot = Ok(data);
    } else if let Self::Tar(lock) = self {
        let already_loaded = lock.read().expect("poison").is_ok();
        if already_loaded {
            return Ok(());
        }
        let mut slot = lock.write().expect("poison");
        // Re-check: another thread may have loaded the archive in the meantime.
        let Err(url) = &*slot else {
            return Ok(());
        };
        let local = Self::download(url)?;
        let data = TarArchive::try_new(std::fs::read(&local)?)
            .map_err(|e| e.with_path_context(&local))?;
        *slot = Ok(data);
    }
    Ok(())
}
/// Reads the complete contents of the file at `path`.
///
/// # Errors
///
/// Fails when the backing archive cannot be initialized, when `path` does not
/// exist in this file system, or when reading it fails.
pub fn read_to_buf(&self, path: &str) -> Result<Vec<u8>, DataError> {
    self.init()?;
    match self {
        Self::Fs(root) => {
            log::debug!("Reading: {}/{}", root.display(), path);
            let full_path = root.join(path);
            match std::fs::read(&full_path) {
                Ok(bytes) => Ok(bytes),
                Err(e) => Err(DataError::from(e).with_path_context(&full_path)),
            }
        }
        Self::Zip(zip) => {
            log::debug!("Reading: <zip>/{path}");
            // Looking up an entry needs mutable access to the archive, hence the write lock.
            let mut guard = zip.write().expect("poison");
            let data = guard.as_mut().ok().unwrap(); // init called
            let mut entry = data.archive.by_name(path).map_err(|e| {
                DataErrorKind::Io(std::io::ErrorKind::NotFound)
                    .into_error()
                    .with_display_context(&e)
                    .with_display_context(path)
            })?;
            let mut buf = Vec::new();
            entry.read_to_end(&mut buf)?;
            Ok(buf)
        }
        Self::Tar(tar) => {
            log::debug!("Reading: <tar>/{path}");
            let guard = tar.read().expect("poison");
            let bytes = &guard.as_ref().unwrap().archive; // init called
            // Walk the entries in order and return the first one whose path matches.
            let find_entry = || -> std::io::Result<Vec<u8>> {
                for entry in tar::Archive::new(Cursor::new(bytes)).entries_with_seek()? {
                    let mut entry = entry?;
                    if entry.path()?.as_os_str() == path {
                        let mut contents = vec![];
                        entry.read_to_end(&mut contents)?;
                        return Ok(contents);
                    }
                }
                Err(std::io::ErrorKind::NotFound.into())
            };
            find_entry().map_err(|e| {
                DataErrorKind::Io(e.kind())
                    .into_error()
                    .with_display_context(&e)
                    .with_display_context(path)
            })
        }
        #[cfg(feature = "networking")]
        Self::Http(url) => {
            let local = Self::download(&format!("{url}/{path}"))?;
            Ok(std::fs::read(local)?)
        }
        Self::Memory(map) => match map.get(path) {
            Some(bytes) => Ok(bytes.to_vec()),
            None => Err(DataError::custom("Not found in icu4x-datagen's data/")
                .with_display_context(path)),
        },
    }
}
/// Reads the file at `path` and returns its contents as a `String`.
///
/// # Errors
///
/// Propagates read errors and returns an "Invalid UTF-8" error when the
/// contents are not valid UTF-8.
#[allow(dead_code)]
pub(crate) fn read_to_string(&self, path: &str) -> Result<String, DataError> {
    let bytes = self.read_to_buf(path)?;
    match String::from_utf8(bytes) {
        Ok(text) => Ok(text),
        Err(e) => Err(DataError::custom("Invalid UTF-8").with_display_context(&e)),
    }
}
/// Lists the names of the entries directly below `path`, without duplicates and
/// in unspecified order.
///
/// For archive and in-memory file systems the result is derived from the stored
/// file paths: every path that starts with `path` contributes its first non-empty
/// `/`-separated segment after that prefix.
///
/// NOTE(review): the prefix test is a plain string comparison, so a `path`
/// without a trailing separator also matches siblings that merely share the
/// prefix — confirm callers never rely on or trip over this.
///
/// # Errors
///
/// Fails when the backing archive cannot be initialized, when the directory
/// cannot be read, when a directory entry has a non-UTF-8 name, or when this is an
/// HTTP file system (which cannot be listed).
pub(crate) fn list(&self, path: &str) -> Result<impl Iterator<Item = String>, DataError> {
    self.init()?;
    let names: HashSet<String> = match self {
        Self::Fs(root) => std::fs::read_dir(root.join(path))
            .map_err(|e| DataError::from(e).with_display_context(path))?
            .map(|entry| -> Result<String, DataError> {
                // Report unrepresentable names as an error instead of panicking.
                entry?.file_name().into_string().map_err(|name| {
                    DataError::custom("Non-UTF-8 file name")
                        .with_display_context(&name.to_string_lossy())
                })
            })
            .collect::<Result<HashSet<_>, DataError>>()?,
        Self::Zip(zip) => {
            let guard = zip.read().expect("poison");
            let data = guard.as_ref().ok().unwrap(); // init called
            Self::child_names(data.file_list.iter().map(String::as_str), path)
        }
        Self::Tar(tar) => {
            let guard = tar.read().expect("poison");
            let data = guard.as_ref().ok().unwrap(); // init called
            Self::child_names(data.file_list.iter().map(String::as_str), path)
        }
        #[cfg(feature = "networking")]
        Self::Http(url) => {
            return Err(
                DataError::custom("Cannot list HTTP directories").with_display_context(url)
            )
        }
        Self::Memory(map) => Self::child_names(map.keys().copied(), path),
    };
    Ok(names.into_iter())
}

/// Collects, for every path in `paths` that starts with `dir`, the first non-empty
/// `/`-separated segment following that prefix.
fn child_names<'a>(paths: impl Iterator<Item = &'a str>, dir: &str) -> HashSet<String> {
    paths
        .filter_map(|p| p.strip_prefix(dir))
        .filter_map(|suffix| suffix.split('/').find(|s| !s.is_empty()))
        .map(String::from)
        .collect()
}
/// Reports whether `path` names a file in this file system.
///
/// For an HTTP file system this attempts to download the file and reports
/// whether that succeeded.
///
/// # Errors
///
/// Fails when the backing archive cannot be initialized.
pub fn file_exists(&self, path: &str) -> Result<bool, DataError> {
    self.init()?;
    let exists = match self {
        Self::Fs(root) => root.join(path).is_file(),
        Self::Zip(zip) => {
            let guard = zip.read().expect("poison");
            let data = guard.as_ref().ok().unwrap(); // init called
            data.file_list.contains(path)
        }
        Self::Tar(tar) => {
            let guard = tar.read().expect("poison");
            let data = guard.as_ref().ok().unwrap(); // init called
            data.file_list.contains(path)
        }
        #[cfg(feature = "networking")]
        Self::Http(url) => Self::download(&format!("{url}/{path}")).is_ok(),
        Self::Memory(map) => map.contains_key(path),
    };
    Ok(exists)
}
}
/// Lazily parsed time zone database read from an [`AbstractFs`].
#[derive(Debug)]
pub(crate) struct TzdbCache {
/// The file system abstraction that the TZDB source files are read from.
pub(crate) root: AbstractFs,
/// The parse result, filled in by the first call to `parsed` and reused afterwards
/// (a failure is stored as well).
pub(crate) transitions: OnceLock<Result<Tzdb, DataError>>,
}
/// The parsed TZDB tables: the main table plus the optional rearguard and
/// vanguard variants.
#[derive(Debug)]
pub(crate) struct Tzdb {
/// The main TZDB as defined by main.zi (i.e. `cat africa antarctica asia ...`).
pub(crate) main: parse_zoneinfo::table::Table,
/// The TZDB defined by the rearguard.zi file, if present.
///
/// This file requires running `make` in the tzdata directory, which is not
/// generally possible. However, it is present in testdata.
pub(crate) rearguard: Option<parse_zoneinfo::table::Table>,
/// The TZDB defined by the vanguard.zi file, if present.
///
/// This file requires running `make` in the tzdata directory, which is not
/// generally possible. However, it is present in testdata.
pub(crate) vanguard: Option<parse_zoneinfo::table::Table>,
}
impl TzdbCache {
pub(crate) fn new(root: AbstractFs) -> Self {
Self {
root,
transitions: Default::default(),
}
}
pub(crate) fn parsed(&self) -> Result<&Tzdb, DataError> {
self.transitions
.get_or_init(|| {
fn parse(lines: Vec<String>) -> parse_zoneinfo::table::Table {
use parse_zoneinfo::line::Line;
use parse_zoneinfo::table::TableBuilder;
let mut table = TableBuilder::new();
for line in lines {
match Line::new(&line).unwrap() {
Line::Zone(zone) => table.add_zone_line(zone).unwrap(),
Line::Continuation(cont) => table.add_continuation_line(cont).unwrap(),
Line::Rule(rule) => table.add_rule_line(rule).unwrap(),
Line::Link(link) => table.add_link_line(link).unwrap(),
Line::Space => {}
}
}
table.build()
}
Ok(Tzdb {
main: parse(
[
"africa",
"antarctica",
"asia",
"australasia",
"europe",
"northamerica",
"southamerica",
"etcetera",
"factory",
"backward",
]
.into_iter()
.try_fold(Vec::new(), |mut lines, file| {
lines.extend(
self.root
.read_to_string(file)?
.lines()
.map(ToOwned::to_owned),
);
Ok::<_, DataError>(lines)
})?,
),
rearguard: self.root.file_exists("rearguard.zi")?.then(|| {
parse(
self.root
.read_to_string("rearguard.zi")
.unwrap()
.lines()
.map(ToOwned::to_owned)
.collect(),
)
}),
vanguard: self.root.file_exists("vanguard.zi")?.then(|| {
parse(
self.root
.read_to_string("vanguard.zi")
.unwrap()
.lines()
.map(ToOwned::to_owned)
.collect(),
)
}),
})
})
.as_ref()
.map_err(|&e| e)
}
}
/// A cache representing https://unicode.org/Public/{version}/
#[derive(Debug)]
pub(crate) struct UnicodeCache {
// Fallback file system for every request that is not served by one of the ZIP files below.
root: AbstractFs,
// The `ucd/UCD.zip` file. Requests below `ucd/` (other than `ucd/Unihan/`) will be
// resolved through the ZIP file instead of downloading individual files.
ucd_zip: Option<AbstractFs>,
// The `ucd/Unihan.zip` file. Requests matching `ucd/Unihan/` will be resolved through
// the ZIP file instead of downloading individual files.
unihan_zip: Option<AbstractFs>,
// The `security/uts39-data-X.0.0.zip` file. Requests matching `security/` will be
// resolved through the ZIP file instead of downloading individual files.
// NOTE(review): the field name says "uts_35", but the archive it holds is the UTS #39 data.
uts_35_zip: Option<AbstractFs>,
// Cached file contents. It's all text files, so we cache them as strings.
file_cache: FrozenMap<String, String>,
// Cache of enumerated properties CPTs, indexed by short property name.
//
// CPT building is a big bottleneck, so we don't want to build the same one twice.
#[allow(dead_code)]
pub(crate) cpt_cache: FrozenMap<&'static str, Box<dyn Any + Send + Sync>>,
}
impl UnicodeCache {
    /// Creates a cache backed by `https://www.unicode.org/Public/{version}/`.
    ///
    /// Files below `ucd/`, `ucd/Unihan/` and `security/` are served from the
    /// corresponding ZIP files on that site; everything else is fetched file by file.
    #[cfg(feature = "networking")]
    pub fn new_remote(version: &str) -> Self {
        let root = AbstractFs::new_from_url(format!("https://www.unicode.org/Public/{version}/"));
        let ucd_zip = AbstractFs::new_zip_from_url(format!(
            "https://www.unicode.org/Public/{version}/ucd/UCD.zip"
        ));
        let unihan_zip = AbstractFs::new_zip_from_url(format!(
            "https://www.unicode.org/Public/{version}/ucd/Unihan.zip"
        ));
        let uts_35_zip = AbstractFs::new_zip_from_url(format!(
            "https://www.unicode.org/Public/{version}/security/uts39-data-{version}.zip"
        ));
        Self {
            root,
            ucd_zip: Some(ucd_zip),
            unihan_zip: Some(unihan_zip),
            uts_35_zip: Some(uts_35_zip),
            file_cache: FrozenMap::new(),
            cpt_cache: FrozenMap::new(),
        }
    }

    /// Creates a cache that serves every request directly from `root`.
    pub fn new_local(root: AbstractFs) -> Self {
        Self {
            root,
            ucd_zip: None,
            unihan_zip: None,
            uts_35_zip: None,
            file_cache: FrozenMap::new(),
            cpt_cache: FrozenMap::new(),
        }
    }

    /// Picks the file system that serves `file`, together with the path to use
    /// inside that file system.
    ///
    /// A ZIP file is chosen when it is configured and `file` starts with the
    /// directory it covers; the prefix is then stripped. `ucd/Unihan/` is tested
    /// before `ucd/` because the latter is a prefix of the former. Everything else
    /// goes to `root` with the path unchanged.
    #[allow(dead_code)] // only reachable through the readers below, which may be unused
    fn resolve<'a>(&'a self, file: &'a str) -> (&'a AbstractFs, &'a str) {
        if let (Some(unihan_zip), Some(unihan_path)) =
            (self.unihan_zip.as_ref(), file.strip_prefix("ucd/Unihan/"))
        {
            (unihan_zip, unihan_path)
        } else if let (Some(ucd_zip), Some(ucd_path)) =
            (self.ucd_zip.as_ref(), file.strip_prefix("ucd/"))
        {
            (ucd_zip, ucd_path)
        } else if let (Some(uts_35_zip), Some(uts_35_path)) =
            (self.uts_35_zip.as_ref(), file.strip_prefix("security/"))
        {
            (uts_35_zip, uts_35_path)
        } else {
            (&self.root, file)
        }
    }

    /// Reports whether `file` exists, either in the content cache or in the file
    /// system that serves it.
    #[allow(dead_code)]
    pub fn file_exists(&self, file: &str) -> Result<bool, DataError> {
        if self.file_cache.get(file).is_some() {
            return Ok(true);
        }
        let (fs, path) = self.resolve(file);
        fs.file_exists(path)
    }

    /// Returns the contents of `file`, reading it from the file system that serves
    /// it on first access and from the content cache afterwards.
    #[allow(dead_code)] // only used with CodePointTrieBuilder, which is feature-gated
    pub fn read_to_string(&self, file: &str) -> Result<&str, DataError> {
        if let Some(x) = self.file_cache.get(file) {
            return Ok(x);
        }
        let (fs, path) = self.resolve(file);
        Ok(self
            .file_cache
            .insert(file.to_string(), fs.read_to_string(path)?))
    }
}
/// Builds an `AbstractFs::Memory` from files embedded at compile time.
///
/// `$base` is prepended to each `$file` to form the path passed to `include_bytes!`;
/// the map key for each entry is `$file` alone.
macro_rules! include_files {
($base:literal; $($file:literal),* $(,)?) => {
crate::source::AbstractFs::Memory([
$(
// Key by the bare file name; embed the bytes found at `$base` + `$file`.
($file, include_bytes!(concat!($base, $file)).as_slice()),
)*
].into_iter().collect())
};
}
pub(crate) use include_files;