Browse Source

implements --filter-crates, --dry-run, --overwrite-existing

--filter-crates: limit downloads to crates with names matching supplied regex
--dry-run: don't actually download
--overwrite-existing: re-download even if .crate file already exists in output dir
master
Jonathan Strong 2 years ago
parent
commit
b7621e78ac
  1. 1
      .gitignore
  2. 25
      Cargo.lock
  3. 3
      Cargo.toml
  4. 15
      config.toml.sample
  5. 43
      doc/cli-menu.txt
  6. 1
      doc/just-commands.txt
  7. 5
      justfile
  8. 268
      src/main.rs

1
.gitignore vendored

@ -1,3 +1,4 @@
/target
*.swp
config.toml
/output

25
Cargo.lock generated

@ -75,6 +75,17 @@ dependencies = [
"syn",
]
[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi",
"libc",
"winapi",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@ -199,11 +210,13 @@ version = "3.2.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23b71c3ce99b7611011217b366d923f1d0a7e07a92bb2dbf1e84508c673ca3bd"
dependencies = [
"atty",
"bitflags",
"clap_derive",
"clap_lex",
"indexmap",
"once_cell",
"termcolor",
"textwrap",
]
@ -1310,7 +1323,7 @@ checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244"
[[package]]
name = "registry-backup"
version = "0.2.4"
version = "0.3.0"
dependencies = [
"chrono",
"clap",
@ -1318,6 +1331,7 @@ dependencies = [
"governor",
"num_cpus",
"pretty_toa",
"regex",
"reqwest",
"serde",
"serde_json",
@ -1623,6 +1637,15 @@ dependencies = [
"unic-segment",
]
[[package]]
name = "termcolor"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755"
dependencies = [
"winapi-util",
]
[[package]]
name = "textwrap"
version = "0.15.0"

3
Cargo.toml

@ -1,7 +1,7 @@
[package]
name = "registry-backup"
authors = ["Jonathan Strong <jstrong@shipyard.rs>"]
version = "0.2.4"
version = "0.3.0"
edition = "2021"
publish = ["shipyard-rs-public"]
readme = "README.md"
@ -37,6 +37,7 @@ governor = "0.4.2"
pretty_toa = "1"
tera = { version = "1", optional = true }
chrono = { version = "0.4", optional = true }
regex = "1.6"
[features]
default = []

15
config.toml.sample

@ -1,16 +1,17 @@
# index repo url
index-url = "ssh://git@ssh.shipyard.rs/shipyard-rs-public/crate-index.git"
dry-run = false
filter-crates = "^."
[registry]
index-url = "ssh://git@ssh.shipyard.rs/shipyard-rs-public/crate-index.git"
# alternatively, specify a local dir
# index-path = "/path/to/cloned/index"
auth-token = "xxx"
[http]
user-agent = "registry-backup/v0.1.0"
requests-per-second = 40
max-concurrent-requests = 20
requests-per-second = 100
max-concurrent-requests = 50
[output]
path = "output"
format = "/{crate}/{version}/download"
# auth-token = "xxx"
overwrite-existing = false

43
doc/cli-menu.txt

@ -1,4 +1,4 @@
registry-backup 0.2.4
registry-backup 0.3.0
Jonathan Strong <jstrong@shipyard.rs>
Download all .crate files from a registry server
@ -6,36 +6,49 @@ USAGE:
registry-backup [OPTIONS]
OPTIONS:
--index-url <INDEX_URL>
--index-url <URL>
URL of the registry index we are downloading .crate files from. The program expects that
it will be able to clone the index to a local temporary directory; the user must handle
authentication if needed
--index-path <INDEX_PATH>
--index-path <PATH>
instead of an index url, just point to a local path where the index is already cloned
-p, --output-path <PATH>
-a, --auth-token <TOKEN>
If registry requires authorization (i.e. "auth-required" key is set to `true` in the
`config.json` file), the token to include using the Authorization HTTP header
-o, --output-path <PATH>
Directory where downloaded .crate files will be saved to [default: output]
-u, --user-agent <USER_AGENT>
Value of user-agent HTTP header [default: registry-backup/v0.2.4]
-U, --user-agent <USER_AGENT>
Value of user-agent HTTP header [default: registry-backup/v0.3.0]
--requests-per-second <REQUESTS_PER_SECOND>
Requests to registry server will not exceed this rate [default: 25]
-R, --requests-per-second <INT>
Requests to registry server will not exceed this rate [default: 100]
--max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
-M, --max-concurrent-requests <INT>
Independent of the requests per second rate limit, no more than
`max_concurrent_requests` will be in flight at any given moment [default: 10]
`max_concurrent_requests` will be in flight at any given moment [default: 50]
-a, --auth-token <AUTH_TOKEN>
If registry requires authorization (i.e. "auth-required" key is set to `true` in the
`config.json` file), the token to include using the Authorization HTTP header
-c, --config-file <CONFIG_FILE>
-c, --config-file <PATH>
Specify configuration values using the provided TOML file, instead of via command line
flags. The values in the config file will override any values passed as command line
flags. See config.toml.sample for syntax of the config file
--filter-crates <REGEX>
Only crates with names that match --filter-crate regex will be downloaded
--overwrite-existing
Download files when if .crate file already exists in output dir for a given crate
version, and overwrite the existing file with the new one. Default behavior is to skip
downloading if .crate file already exists
--dry-run
Don't actually download the .crate files, just list files which would be downloaded.
Note: --requests-per-second and --max-concurrent-requests are still enforced even in
--dry-mode!
-h, --help
Print help information

1
doc/just-commands.txt

@ -4,6 +4,7 @@ Available recipes:
debug-build +args='' # cargo build wrapper - builds registry-backup in debug mode
generate-readme # generate updated README.md
get-crate-version
install # cargo install registry-backup via git dep
pre-release # check, run tests, check non-error output for clippy, run rustfmt
release # release version (regenerate docs, git tag v0.0.0)
release-build +args='' # cargo build --release wrapper - builds registry-backup in release mode

5
justfile

@ -1,6 +1,7 @@
set dotenv-load := true
rustc-version := "nightly"
publish-registry := "shipyard-rs-public"
repository := "https://git.shipyard.rs/jstrong/registry-backup"
# complicated/ugly one-liner to use lld linker if it's available
export RUSTFLAGS := `LLD=$(which lld) && test $? -eq "0" && echo "-C link-arg=-fuse-ld=lld" || echo ''`
@ -75,3 +76,7 @@ show-build-env:
@echo "rustc-version={{rustc-version}}"
@echo "publish-registry={{publish-registry}}"
@env | rg RUST --color never
# cargo install registry-backup via git dep
install:
just cargo install registry-backup --git {{repository}}

268
src/main.rs

@ -2,6 +2,10 @@ use std::num::NonZeroU32;
use std::path::{Path, PathBuf};
use std::process::Output;
use std::str::from_utf8;
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc,
};
use std::time::*;
use clap::Parser;
@ -52,9 +56,16 @@ pub struct CrateVersion {
#[serde(rename_all = "kebab-case")]
pub struct OutputConfig {
/// Directory where downloaded .crate files will be saved to.
#[clap(short, long = "output-path", default_value = DEFAULT_OUTPUT_PATH)]
#[clap(short = 'o', long = "output-path", default_value = DEFAULT_OUTPUT_PATH)]
#[serde(default = "default_output_path")]
pub path: PathBuf,
/// Download files when if .crate file already exists in output dir for a
/// given crate version, and overwrite the existing file with the new one.
/// Default behavior is to skip downloading if .crate file already exists.
#[serde(default)]
#[clap(long)]
pub overwrite_existing: bool,
// /// What format to use for the output filenames. Works the same as
// /// Cargo's registry syntax for the "dl" key in the `config.json`
// /// file in a reigstry index. See [Cargo
@ -70,60 +81,95 @@ pub struct OutputConfig {
// pub mirror_registry_format: bool,
}
#[derive(Deserialize, Debug, Parser)]
#[derive(Deserialize, Parser)]
#[serde(rename_all = "kebab-case")]
pub struct HttpConfig {
/// Value of user-agent HTTP header
#[serde(default = "default_user_agent")]
#[clap(short, long, default_value = DEFAULT_USER_AGENT)]
#[clap(short = 'U', long, default_value = DEFAULT_USER_AGENT)]
pub user_agent: String,
/// Requests to registry server will not exceed this rate
#[serde(default = "default_requests_per_second")]
#[clap(long, default_value_t = default_requests_per_second())]
#[clap(short = 'R', long, default_value_t = default_requests_per_second())]
#[clap(value_name = "INT")]
pub requests_per_second: NonZeroU32,
/// Independent of the requests per second rate limit, no more
/// than `max_concurrent_requests` will be in flight at any given
/// moment.
#[serde(default = "default_max_concurrent_requests")]
#[clap(long, default_value_t = default_max_concurrent_requests())]
#[clap(short = 'M', long, default_value_t = default_max_concurrent_requests())]
#[clap(value_name = "INT")]
#[clap(alias = "max-concurrency", alias = "concurrency")]
#[serde(alias = "max-concurrency", alias = "concurrency")]
pub max_concurrent_requests: NonZeroU32,
}
/// Download all .crate files from a registry server.
#[derive(Deserialize, Parser)]
#[serde(rename_all = "kebab-case")]
#[clap(author, version, global_setting(clap::AppSettings::DeriveDisplayOrder))]
pub struct Config {
pub struct TargetRegistryConfig {
/// URL of the registry index we are downloading .crate files from. The
/// program expects that it will be able to clone the index to a local
/// temporary directory; the user must handle authentication if needed.
#[serde(default)]
#[clap(long)]
#[serde(default, alias = "registry-path")]
#[clap(long, alias = "registry-url", value_name = "URL")]
pub index_url: Option<String>,
/// instead of an index url, just point to a local path where the index
/// is already cloned.
#[serde(default)]
#[clap(long, conflicts_with = "index-url")]
#[serde(default, alias = "registry-path")]
#[clap(long, conflicts_with = "index-url", alias = "registry-path")]
#[clap(value_name = "PATH")]
pub index_path: Option<PathBuf>,
/// If registry requires authorization (i.e. "auth-required" key is
/// set to `true` in the `config.json` file), the token to include
/// using the Authorization HTTP header.
#[clap(short, long, alias = "token", value_name = "TOKEN")]
#[serde(default)]
pub auth_token: Option<String>,
}
/// Download all .crate files from a registry server.
#[derive(Deserialize, Parser, Debug)]
#[serde(rename_all = "kebab-case")]
#[clap(author, version, global_setting(clap::AppSettings::DeriveDisplayOrder))]
pub struct Config {
/// Crate registry location and authentication
#[clap(flatten)]
pub registry: TargetRegistryConfig,
/// Where to save the downloaded files
#[clap(flatten)]
pub output: OutputConfig,
/// Download settings
#[clap(flatten)]
pub http: HttpConfig,
/// If registry requires authorization (i.e. "auth-required" key is
/// set to `true` in the `config.json` file), the token to include
/// using the Authorization HTTP header.
#[clap(short, long, alias = "token")]
#[serde(default)]
pub auth_token: Option<String>,
/// Specify configuration values using the provided TOML file, instead of
/// via command line flags. The values in the config file will override
/// any values passed as command line flags. See config.toml.sample for
/// syntax of the config file.
#[serde(default)]
#[clap(short, long, exclusive(true))]
#[clap(short, long, value_name = "PATH")]
#[clap(conflicts_with_all(&[
"index-url",
"index-path",
"auth-token",
"path",
"user-agent",
"requests-per-second",
"max-concurrent-requests",
"overwrite-existing",
][..]))]
pub config_file: Option<PathBuf>,
/// Only crates with names that match --filter-crate regex will be downloaded
#[serde(default)]
#[clap(long, value_name = "REGEX", alias = "filter")]
pub filter_crates: Option<String>,
/// Don't actually download the .crate files, just list files which would be
/// downloaded. Note: --requests-per-second and --max-concurrent-requests are
/// still enforced even in --dry-mode!
#[serde(default)]
#[clap(long)]
pub dry_run: bool,
}
const DEFAULT_OUTPUT_PATH: &str = "output";
@ -140,22 +186,55 @@ fn default_user_agent() -> String {
}
const fn default_requests_per_second() -> NonZeroU32 {
unsafe { NonZeroU32::new_unchecked(25) }
unsafe { NonZeroU32::new_unchecked(100) }
}
const fn default_max_concurrent_requests() -> NonZeroU32 {
unsafe { NonZeroU32::new_unchecked(10) }
unsafe { NonZeroU32::new_unchecked(50) }
}
impl Config {
pub fn skip_existing(&self) -> bool {
!self.output.overwrite_existing
}
pub fn compile_filter(&self) -> Result<Option<regex::Regex>, AnyError> {
match self.filter_crates.as_ref() {
Some(regex) => {
let compiled = regex::Regex::new(regex).map_err(|e| {
error!(%regex, err = ?e, "regex failed to compile: {}", e);
e
})?;
Ok(Some(compiled))
}
None => Ok(None),
}
}
}
impl std::fmt::Debug for Config {
impl std::fmt::Debug for TargetRegistryConfig {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.debug_struct("Config")
.field("index_url", &self.index_url)
.field("index_path", &self.index_path)
.field("output", &self.output)
.field("http", &self.http)
.field("auth_token", &"***") // hide sensitive data
.field("config_file", &self.config_file)
.finish()
}
}
impl std::fmt::Debug for HttpConfig {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.debug_struct("Config")
.field(
"user_agent",
if self.user_agent.starts_with("shipyard ") {
&"shipyard ***"
} else {
&self.user_agent
},
)
.field("requests_per_second", &self.requests_per_second)
.field("max_concurrent_requests", &self.max_concurrent_requests)
.finish()
}
}
@ -265,7 +344,14 @@ fn is_hidden(entry: &walkdir::DirEntry) -> bool {
.unwrap_or(false)
}
async fn get_crate_versions(clone_dir: &Path) -> Result<Vec<CrateVersion>, AnyError> {
async fn get_crate_versions(
config: &Config,
clone_dir: &Path,
) -> Result<Vec<CrateVersion>, AnyError> {
let filter = config.compile_filter()?;
let mut n_excl = 0;
let n_existing = Arc::new(AtomicUsize::new(0));
let files: Vec<PathBuf> = walkdir::WalkDir::new(clone_dir)
.max_depth(3)
.into_iter()
@ -274,6 +360,15 @@ async fn get_crate_versions(clone_dir: &Path) -> Result<Vec<CrateVersion>, AnyEr
Ok(entry) => {
if entry.file_type().is_file() && entry.depth() >= 2 && entry.depth() <= 3 {
let path = entry.into_path();
if let Some(filter) = filter.as_ref() {
let crate_name = path.file_name().and_then(|x| x.to_str()).unwrap_or("");
if !filter.is_match(crate_name.as_ref()) {
n_excl += 1;
return None;
}
}
debug!(?path, "found crate metadata file to parse");
Some(path)
} else {
@ -287,34 +382,66 @@ async fn get_crate_versions(clone_dir: &Path) -> Result<Vec<CrateVersion>, AnyEr
})
.collect();
info!("found {} crate metadata files to parse", files.len());
let n_files = files.len();
info!("found {} crate metadata files to parse", n_files);
if n_excl > 0 {
warn!(
regex = %config.filter_crates.as_deref().unwrap_or(""),
n_files,
n_excl,
"--filter excluded {} crates", n_excl,
);
}
let crate_versions: Vec<Result<Vec<CrateVersion>, AnyError>> =
futures::stream::iter(files.into_iter().map(|path| async move {
let file = tokio::fs::File::open(&path).await.map_err(|e| {
error!(err = ?e, ?path, "failed to open file");
e
})?;
let buf = tokio::io::BufReader::new(file);
let mut out = Vec::new();
let mut lines = buf.lines();
while let Some(line) = lines.next_line().await? {
let vers: CrateVersion = serde_json::from_str(&line).map_err(|e| {
error!(err = ?e, ?path, "failed to parse line");
futures::stream::iter(files.into_iter().map(|path| {
let n_existing = n_existing.clone();
async move {
let file = tokio::fs::File::open(&path).await.map_err(|e| {
error!(err = ?e, ?path, "failed to open file");
e
})?;
out.push(vers);
}
debug!(crate_name = %out.first().map(|x| x.name.as_str()).unwrap_or("na"),
"parsed {} crate versions from metadata file", out.len()
);
let buf = tokio::io::BufReader::new(file);
let mut out = Vec::new();
let mut lines = buf.lines();
'lines: while let Some(line) = lines.next_line().await? {
let vers: CrateVersion = serde_json::from_str(&line).map_err(|e| {
error!(err = ?e, ?path, "failed to parse line");
e
})?;
Ok(out)
if config.skip_existing() {
let vers_path = format!("{}/{}/download", vers.name, vers.vers);
let output_path = config.output.path.join(vers_path);
if output_path.exists() {
n_existing.fetch_add(1, Ordering::Relaxed);
continue 'lines;
}
}
out.push(vers);
}
debug!(crate_name = %out.first().map(|x| x.name.as_str()).unwrap_or("na"),
"parsed {} crate versions from metadata file", out.len()
);
Ok(out)
}
}))
.buffer_unordered(num_cpus::get())
.collect()
.await;
let n_existing = n_existing.load(Ordering::Relaxed);
if n_existing > 0 {
warn!(
"skipped {} crate versions that were previously downloaded",
n_existing,
);
}
let crate_versions: Vec<CrateVersion> = crate_versions
.into_iter()
.flat_map(|result| match result {
@ -327,6 +454,10 @@ async fn get_crate_versions(clone_dir: &Path) -> Result<Vec<CrateVersion>, AnyEr
.collect();
info!(
n_files,
n_excl,
n_existing,
n_download_targets = crate_versions.len(),
"collected {} total crate versions to download",
crate_versions.len()
);
@ -392,20 +523,35 @@ async fn download_versions(
.user_agent(&config.http.user_agent)
.build()?;
info!(
reqs_per_sec = config.http.requests_per_second,
max_concurrency = config.http.max_concurrent_requests,
"downloading crates at {} reqs/sec",
config.http.requests_per_second,
);
let inner_stream = futures::stream::iter(versions.into_iter().map(|vers| {
let req_begin = Instant::now();
let http_client = http_client.clone();
async move {
// TODO actually parse and use the format
let vers_path = format!("{}/{}/download", vers.name, vers.vers);
let url = format!("{}/{}", registry_config.dl, vers_path);
let output_path = config.output.path.join(vers_path);
if config.dry_run {
debug!(%url, "skipping download (--dry-run mode)");
return Ok(None);
}
debug!(?url, "downloading...");
let req = http_client
.get(url)
.header(CONTENT_TYPE, "application/json")
.header(ACCEPT, "application/json");
let req = if let Some(token) = config.auth_token.as_deref() {
let req = if let Some(token) = config.registry.auth_token.as_deref() {
req.header(AUTHORIZATION, token)
} else {
req
@ -421,7 +567,6 @@ async fn download_versions(
Err::<_, AnyError>(format!("error response {:?} from server", status).into())
} else {
// TODO: check if this path exists already before downloading
let output_path = config.output.path.join(vers_path);
ensure_file_parent_dir_exists(&output_path)
.await
.map_err(|e| {
@ -440,7 +585,7 @@ async fn download_versions(
version = %vers.vers,
"downloaded .crate file in {:?}", req_begin.elapsed());
debug!(?output_path, "wrote {} bytes to file", body.len());
Ok(output_path)
Ok(Some(output_path))
}
}
}))
@ -448,24 +593,32 @@ async fn download_versions(
let outer_stream = inner_stream.ratelimit_stream(&rate_limit);
let results: Vec<Result<PathBuf, AnyError>> = outer_stream.collect().await;
let results: Vec<Result<Option<PathBuf>, AnyError>> = outer_stream.collect().await;
let mut ret = Ok(());
let n = results.len();
let mut n_err = 0;
let mut n_skip = 0;
for result in results {
if let Err(e) = result {
n_err += 1;
error!(err = ?e, "download failed");
ret = Err(e);
match result {
Ok(None) => n_skip += 1,
Err(e) => {
n_err += 1;
error!(err = ?e, "download failed");
ret = Err(e);
}
_ => {}
}
}
let n_ok = n - n_err;
let n_ok = n - n_err - n_skip;
info!(
n_ok,
n_err,
n_skip,
"finished downloading {} files in {:?}",
n_ok,
begin.elapsed()
@ -479,13 +632,16 @@ async fn run(config: Config) -> Result<(), AnyError> {
debug!("config:\n{:#?}\n", config);
assert!(
config.index_url.is_some() || config.index_path.is_some(),
config.registry.index_url.is_some() || config.registry.index_path.is_some(),
"one of index-url or index-path is required",
);
// verify regex compiles
let _ = config.compile_filter()?;
let tmpdir = tempdir::TempDir::new("registry-backup-index")?;
let index_path = match (&config.index_url, &config.index_path) {
let index_path = match (&config.registry.index_url, &config.registry.index_path) {
(Some(url), _) => {
let tmp = tmpdir.path();
git_clone(url, tmp, &[][..]).await?;
@ -499,7 +655,7 @@ async fn run(config: Config) -> Result<(), AnyError> {
let registry_config = load_registry_config(index_path).await?;
let versions = get_crate_versions(index_path).await?;
let versions = get_crate_versions(&config, index_path).await?;
download_versions(&config, &registry_config, versions).await?;

Loading…
Cancel
Save