diff --git a/.gitignore b/.gitignore index 1d82798..63118b1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /target *.swp config.toml +/output diff --git a/Cargo.lock b/Cargo.lock index abaeb91..cff21fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,6 +75,17 @@ dependencies = [ "syn", ] +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -199,11 +210,13 @@ version = "3.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23b71c3ce99b7611011217b366d923f1d0a7e07a92bb2dbf1e84508c673ca3bd" dependencies = [ + "atty", "bitflags", "clap_derive", "clap_lex", "indexmap", "once_cell", + "termcolor", "textwrap", ] @@ -1310,7 +1323,7 @@ checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" [[package]] name = "registry-backup" -version = "0.2.4" +version = "0.3.0" dependencies = [ "chrono", "clap", @@ -1318,6 +1331,7 @@ dependencies = [ "governor", "num_cpus", "pretty_toa", + "regex", "reqwest", "serde", "serde_json", @@ -1623,6 +1637,15 @@ dependencies = [ "unic-segment", ] +[[package]] +name = "termcolor" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +dependencies = [ + "winapi-util", +] + [[package]] name = "textwrap" version = "0.15.0" diff --git a/Cargo.toml b/Cargo.toml index f399b2c..8869e78 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "registry-backup" authors = ["Jonathan Strong "] -version = "0.2.4" +version = "0.3.0" edition = "2021" publish = ["shipyard-rs-public"] readme = "README.md" @@ -37,6 +37,7 @@ governor = "0.4.2" pretty_toa = "1" tera = { version = "1", optional = true } chrono = { version = "0.4", 
optional = true } +regex = "1.6" [features] default = [] diff --git a/config.toml.sample b/config.toml.sample index c3d7fab..10241ae 100644 --- a/config.toml.sample +++ b/config.toml.sample @@ -1,16 +1,17 @@ -# index repo url -index-url = "ssh://git@ssh.shipyard.rs/shipyard-rs-public/crate-index.git" +dry-run = false +filter-crates = "^." +[registry] +index-url = "ssh://git@ssh.shipyard.rs/shipyard-rs-public/crate-index.git" # alternatively, specify a local dir # index-path = "/path/to/cloned/index" +auth-token = "xxx" [http] user-agent = "registry-backup/v0.1.0" -requests-per-second = 40 -max-concurrent-requests = 20 +requests-per-second = 100 +max-concurrent-requests = 50 [output] path = "output" -format = "/{crate}/{version}/download" - -# auth-token = "xxx" +overwrite-existing = false diff --git a/doc/cli-menu.txt b/doc/cli-menu.txt index 01d2120..48d9e29 100644 --- a/doc/cli-menu.txt +++ b/doc/cli-menu.txt @@ -1,4 +1,4 @@ -registry-backup 0.2.4 +registry-backup 0.3.0 Jonathan Strong Download all .crate files from a registry server @@ -6,36 +6,49 @@ USAGE: registry-backup [OPTIONS] OPTIONS: - --index-url + --index-url URL of the registry index we are downloading .crate files from. The program expects that it will be able to clone the index to a local temporary directory; the user must handle authentication if needed - --index-path + --index-path instead of an index url, just point to a local path where the index is already cloned - -p, --output-path + -a, --auth-token + If registry requires authorization (i.e. 
"auth-required" key is set to `true` in the + `config.json` file), the token to include using the Authorization HTTP header + + -o, --output-path + Directory where downloaded .crate files will be saved to [default: output] - -u, --user-agent - Value of user-agent HTTP header [default: registry-backup/v0.2.4] + -U, --user-agent + Value of user-agent HTTP header [default: registry-backup/v0.3.0] - --requests-per-second - Requests to registry server will not exceed this rate [default: 25] + -R, --requests-per-second + Requests to registry server will not exceed this rate [default: 100] - --max-concurrent-requests + -M, --max-concurrent-requests Independent of the requests per second rate limit, no more than - `max_concurrent_requests` will be in flight at any given moment [default: 10] + `max_concurrent_requests` will be in flight at any given moment [default: 50] - -a, --auth-token - If registry requires authorization (i.e. "auth-required" key is set to `true` in the - `config.json` file), the token to include using the Authorization HTTP header - - -c, --config-file + -c, --config-file Specify configuration values using the provided TOML file, instead of via command line flags. The values in the config file will override any values passed as command line flags. See config.toml.sample for syntax of the config file + --filter-crates + Only crates with names that match --filter-crates regex will be downloaded + + --overwrite-existing + Download files even if .crate file already exists in output dir for a given crate + version, and overwrite the existing file with the new one. Default behavior is to skip + downloading if .crate file already exists + + --dry-run + Don't actually download the .crate files, just list files which would be downloaded. + Note: --requests-per-second and --max-concurrent-requests are still enforced even in + --dry-run mode! 
+ -h, --help Print help information diff --git a/doc/just-commands.txt b/doc/just-commands.txt index 971e71d..a097138 100644 --- a/doc/just-commands.txt +++ b/doc/just-commands.txt @@ -4,6 +4,7 @@ Available recipes: debug-build +args='' # cargo build wrapper - builds registry-backup in debug mode generate-readme # generate updated README.md get-crate-version + install # cargo install registry-backup via git dep pre-release # check, run tests, check non-error output for clippy, run rustfmt release # release version (regenerate docs, git tag v0.0.0) release-build +args='' # cargo build --release wrapper - builds registry-backup in release mode diff --git a/justfile b/justfile index 7939000..d1fcf9d 100644 --- a/justfile +++ b/justfile @@ -1,6 +1,7 @@ set dotenv-load := true rustc-version := "nightly" publish-registry := "shipyard-rs-public" +repository := "https://git.shipyard.rs/jstrong/registry-backup" # complicated/ugly one-liner to use lld linker if it's available export RUSTFLAGS := `LLD=$(which lld) && test $? -eq "0" && echo "-C link-arg=-fuse-ld=lld" || echo ''` @@ -75,3 +76,7 @@ show-build-env: @echo "rustc-version={{rustc-version}}" @echo "publish-registry={{publish-registry}}" @env | rg RUST --color never + +# cargo install registry-backup via git dep +install: + just cargo install registry-backup --git {{repository}} diff --git a/src/main.rs b/src/main.rs index 6950209..ca569b3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,10 @@ use std::num::NonZeroU32; use std::path::{Path, PathBuf}; use std::process::Output; use std::str::from_utf8; +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, +}; use std::time::*; use clap::Parser; @@ -52,9 +56,16 @@ pub struct CrateVersion { #[serde(rename_all = "kebab-case")] pub struct OutputConfig { /// Directory where downloaded .crate files will be saved to. 
- #[clap(short, long = "output-path", default_value = DEFAULT_OUTPUT_PATH)] + #[clap(short = 'o', long = "output-path", default_value = DEFAULT_OUTPUT_PATH)] #[serde(default = "default_output_path")] pub path: PathBuf, + + /// Download files even if .crate file already exists in output dir for a + /// given crate version, and overwrite the existing file with the new one. + /// Default behavior is to skip downloading if .crate file already exists. + #[serde(default)] + #[clap(long)] + pub overwrite_existing: bool, // /// What format to use for the output filenames. Works the same as // /// Cargo's registry syntax for the "dl" key in the `config.json` // /// file in a reigstry index. See [Cargo @@ -70,60 +81,95 @@ pub struct OutputConfig { // pub mirror_registry_format: bool, } -#[derive(Deserialize, Debug, Parser)] +#[derive(Deserialize, Parser)] #[serde(rename_all = "kebab-case")] pub struct HttpConfig { /// Value of user-agent HTTP header #[serde(default = "default_user_agent")] - #[clap(short, long, default_value = DEFAULT_USER_AGENT)] + #[clap(short = 'U', long, default_value = DEFAULT_USER_AGENT)] pub user_agent: String, /// Requests to registry server will not exceed this rate #[serde(default = "default_requests_per_second")] - #[clap(long, default_value_t = default_requests_per_second())] + #[clap(short = 'R', long, default_value_t = default_requests_per_second())] + #[clap(value_name = "INT")] pub requests_per_second: NonZeroU32, /// Independent of the requests per second rate limit, no more /// than `max_concurrent_requests` will be in flight at any given /// moment. 
#[serde(default = "default_max_concurrent_requests")] - #[clap(long, default_value_t = default_max_concurrent_requests())] + #[clap(short = 'M', long, default_value_t = default_max_concurrent_requests())] + #[clap(value_name = "INT")] + #[clap(alias = "max-concurrency", alias = "concurrency")] + #[serde(alias = "max-concurrency", alias = "concurrency")] pub max_concurrent_requests: NonZeroU32, } -/// Download all .crate files from a registry server. #[derive(Deserialize, Parser)] #[serde(rename_all = "kebab-case")] -#[clap(author, version, global_setting(clap::AppSettings::DeriveDisplayOrder))] -pub struct Config { +pub struct TargetRegistryConfig { /// URL of the registry index we are downloading .crate files from. The /// program expects that it will be able to clone the index to a local /// temporary directory; the user must handle authentication if needed. - #[serde(default)] - #[clap(long)] + #[serde(default, alias = "registry-path")] + #[clap(long, alias = "registry-url", value_name = "URL")] pub index_url: Option, /// instead of an index url, just point to a local path where the index /// is already cloned. - #[serde(default)] - #[clap(long, conflicts_with = "index-url")] + #[serde(default, alias = "registry-path")] + #[clap(long, conflicts_with = "index-url", alias = "registry-path")] + #[clap(value_name = "PATH")] pub index_path: Option, + /// If registry requires authorization (i.e. "auth-required" key is + /// set to `true` in the `config.json` file), the token to include + /// using the Authorization HTTP header. + #[clap(short, long, alias = "token", value_name = "TOKEN")] + #[serde(default)] + pub auth_token: Option, +} + +/// Download all .crate files from a registry server. 
+#[derive(Deserialize, Parser, Debug)] +#[serde(rename_all = "kebab-case")] +#[clap(author, version, global_setting(clap::AppSettings::DeriveDisplayOrder))] +pub struct Config { + /// Crate registry location and authentication + #[clap(flatten)] + pub registry: TargetRegistryConfig, /// Where to save the downloaded files #[clap(flatten)] pub output: OutputConfig, /// Download settings #[clap(flatten)] pub http: HttpConfig, - /// If registry requires authorization (i.e. "auth-required" key is - /// set to `true` in the `config.json` file), the token to include - /// using the Authorization HTTP header. - #[clap(short, long, alias = "token")] - #[serde(default)] - pub auth_token: Option, /// Specify configuration values using the provided TOML file, instead of /// via command line flags. The values in the config file will override /// any values passed as command line flags. See config.toml.sample for /// syntax of the config file. #[serde(default)] - #[clap(short, long, exclusive(true))] + #[clap(short, long, value_name = "PATH")] + #[clap(conflicts_with_all(&[ + "index-url", + "index-path", + "auth-token", + "path", + "user-agent", + "requests-per-second", + "max-concurrent-requests", + "overwrite-existing", + ][..]))] pub config_file: Option, + + /// Only crates with names that match --filter-crates regex will be downloaded + #[serde(default)] + #[clap(long, value_name = "REGEX", alias = "filter")] + pub filter_crates: Option, + + /// Don't actually download the .crate files, just list files which would be + /// downloaded. Note: --requests-per-second and --max-concurrent-requests are + /// still enforced even in --dry-run mode! 
+ #[serde(default)] + #[clap(long)] + pub dry_run: bool, } const DEFAULT_OUTPUT_PATH: &str = "output"; @@ -140,22 +186,55 @@ fn default_user_agent() -> String { } const fn default_requests_per_second() -> NonZeroU32 { - unsafe { NonZeroU32::new_unchecked(25) } + unsafe { NonZeroU32::new_unchecked(100) } } const fn default_max_concurrent_requests() -> NonZeroU32 { - unsafe { NonZeroU32::new_unchecked(10) } + unsafe { NonZeroU32::new_unchecked(50) } +} + +impl Config { + pub fn skip_existing(&self) -> bool { + !self.output.overwrite_existing + } + + pub fn compile_filter(&self) -> Result, AnyError> { + match self.filter_crates.as_ref() { + Some(regex) => { + let compiled = regex::Regex::new(regex).map_err(|e| { + error!(%regex, err = ?e, "regex failed to compile: {}", e); + e + })?; + Ok(Some(compiled)) + } + None => Ok(None), + } + } } -impl std::fmt::Debug for Config { +impl std::fmt::Debug for TargetRegistryConfig { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { f.debug_struct("Config") .field("index_url", &self.index_url) .field("index_path", &self.index_path) - .field("output", &self.output) - .field("http", &self.http) .field("auth_token", &"***") // hide sensitive data - .field("config_file", &self.config_file) + .finish() + } +} + +impl std::fmt::Debug for HttpConfig { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("Config") + .field( + "user_agent", + if self.user_agent.starts_with("shipyard ") { + &"shipyard ***" + } else { + &self.user_agent + }, + ) + .field("requests_per_second", &self.requests_per_second) + .field("max_concurrent_requests", &self.max_concurrent_requests) .finish() } } @@ -265,7 +344,14 @@ fn is_hidden(entry: &walkdir::DirEntry) -> bool { .unwrap_or(false) } -async fn get_crate_versions(clone_dir: &Path) -> Result, AnyError> { +async fn get_crate_versions( + config: &Config, + clone_dir: &Path, +) -> Result, AnyError> { + let filter = config.compile_filter()?; + let mut n_excl = 0; + 
let n_existing = Arc::new(AtomicUsize::new(0)); + let files: Vec = walkdir::WalkDir::new(clone_dir) .max_depth(3) .into_iter() @@ -274,6 +360,15 @@ async fn get_crate_versions(clone_dir: &Path) -> Result, AnyEr Ok(entry) => { if entry.file_type().is_file() && entry.depth() >= 2 && entry.depth() <= 3 { let path = entry.into_path(); + + if let Some(filter) = filter.as_ref() { + let crate_name = path.file_name().and_then(|x| x.to_str()).unwrap_or(""); + if !filter.is_match(crate_name.as_ref()) { + n_excl += 1; + return None; + } + } + debug!(?path, "found crate metadata file to parse"); Some(path) } else { @@ -287,34 +382,66 @@ async fn get_crate_versions(clone_dir: &Path) -> Result, AnyEr }) .collect(); - info!("found {} crate metadata files to parse", files.len()); + let n_files = files.len(); + info!("found {} crate metadata files to parse", n_files); + + if n_excl > 0 { + warn!( + regex = %config.filter_crates.as_deref().unwrap_or(""), + n_files, + n_excl, + "--filter excluded {} crates", n_excl, + ); + } let crate_versions: Vec, AnyError>> = - futures::stream::iter(files.into_iter().map(|path| async move { - let file = tokio::fs::File::open(&path).await.map_err(|e| { - error!(err = ?e, ?path, "failed to open file"); - e - })?; - let buf = tokio::io::BufReader::new(file); - let mut out = Vec::new(); - let mut lines = buf.lines(); - while let Some(line) = lines.next_line().await? 
{ - let vers: CrateVersion = serde_json::from_str(&line).map_err(|e| { - error!(err = ?e, ?path, "failed to parse line"); + futures::stream::iter(files.into_iter().map(|path| { + let n_existing = n_existing.clone(); + async move { + let file = tokio::fs::File::open(&path).await.map_err(|e| { + error!(err = ?e, ?path, "failed to open file"); e })?; - out.push(vers); - } - debug!(crate_name = %out.first().map(|x| x.name.as_str()).unwrap_or("na"), - "parsed {} crate versions from metadata file", out.len() - ); + let buf = tokio::io::BufReader::new(file); + let mut out = Vec::new(); + let mut lines = buf.lines(); + 'lines: while let Some(line) = lines.next_line().await? { + let vers: CrateVersion = serde_json::from_str(&line).map_err(|e| { + error!(err = ?e, ?path, "failed to parse line"); + e + })?; - Ok(out) + if config.skip_existing() { + let vers_path = format!("{}/{}/download", vers.name, vers.vers); + let output_path = config.output.path.join(vers_path); + if output_path.exists() { + n_existing.fetch_add(1, Ordering::Relaxed); + continue 'lines; + } + } + + out.push(vers); + } + debug!(crate_name = %out.first().map(|x| x.name.as_str()).unwrap_or("na"), + "parsed {} crate versions from metadata file", out.len() + ); + + Ok(out) + } })) .buffer_unordered(num_cpus::get()) .collect() .await; + let n_existing = n_existing.load(Ordering::Relaxed); + + if n_existing > 0 { + warn!( + "skipped {} crate versions that were previously downloaded", + n_existing, + ); + } + let crate_versions: Vec = crate_versions .into_iter() .flat_map(|result| match result { @@ -327,6 +454,10 @@ async fn get_crate_versions(clone_dir: &Path) -> Result, AnyEr .collect(); info!( + n_files, + n_excl, + n_existing, + n_download_targets = crate_versions.len(), "collected {} total crate versions to download", crate_versions.len() ); @@ -392,20 +523,35 @@ async fn download_versions( .user_agent(&config.http.user_agent) .build()?; + info!( + reqs_per_sec = config.http.requests_per_second, + 
max_concurrency = config.http.max_concurrent_requests, + "downloading crates at {} reqs/sec", + config.http.requests_per_second, + ); + let inner_stream = futures::stream::iter(versions.into_iter().map(|vers| { let req_begin = Instant::now(); let http_client = http_client.clone(); + async move { // TODO actually parse and use the format let vers_path = format!("{}/{}/download", vers.name, vers.vers); let url = format!("{}/{}", registry_config.dl, vers_path); + let output_path = config.output.path.join(vers_path); + + if config.dry_run { + debug!(%url, "skipping download (--dry-run mode)"); + return Ok(None); + } + debug!(?url, "downloading..."); let req = http_client .get(url) .header(CONTENT_TYPE, "application/json") .header(ACCEPT, "application/json"); - let req = if let Some(token) = config.auth_token.as_deref() { + let req = if let Some(token) = config.registry.auth_token.as_deref() { req.header(AUTHORIZATION, token) } else { req @@ -421,7 +567,6 @@ async fn download_versions( Err::<_, AnyError>(format!("error response {:?} from server", status).into()) } else { // TODO: check if this path exists already before downloading - let output_path = config.output.path.join(vers_path); ensure_file_parent_dir_exists(&output_path) .await .map_err(|e| { @@ -440,7 +585,7 @@ async fn download_versions( version = %vers.vers, "downloaded .crate file in {:?}", req_begin.elapsed()); debug!(?output_path, "wrote {} bytes to file", body.len()); - Ok(output_path) + Ok(Some(output_path)) } } })) @@ -448,24 +593,32 @@ async fn download_versions( let outer_stream = inner_stream.ratelimit_stream(&rate_limit); - let results: Vec> = outer_stream.collect().await; + let results: Vec, AnyError>> = outer_stream.collect().await; let mut ret = Ok(()); let n = results.len(); let mut n_err = 0; + let mut n_skip = 0; for result in results { - if let Err(e) = result { - n_err += 1; - error!(err = ?e, "download failed"); - ret = Err(e); + match result { + Ok(None) => n_skip += 1, + + Err(e) => { + 
n_err += 1; + error!(err = ?e, "download failed"); + ret = Err(e); + } + + _ => {} } } - let n_ok = n - n_err; + let n_ok = n - n_err - n_skip; info!( n_ok, n_err, + n_skip, "finished downloading {} files in {:?}", n_ok, begin.elapsed() @@ -479,13 +632,16 @@ async fn run(config: Config) -> Result<(), AnyError> { debug!("config:\n{:#?}\n", config); assert!( - config.index_url.is_some() || config.index_path.is_some(), + config.registry.index_url.is_some() || config.registry.index_path.is_some(), "one of index-url or index-path is required", ); + // verify regex compiles + let _ = config.compile_filter()?; + let tmpdir = tempdir::TempDir::new("registry-backup-index")?; - let index_path = match (&config.index_url, &config.index_path) { + let index_path = match (&config.registry.index_url, &config.registry.index_path) { (Some(url), _) => { let tmp = tmpdir.path(); git_clone(url, tmp, &[][..]).await?; @@ -499,7 +655,7 @@ async fn run(config: Config) -> Result<(), AnyError> { let registry_config = load_registry_config(index_path).await?; - let versions = get_crate_versions(index_path).await?; + let versions = get_crate_versions(&config, index_path).await?; download_versions(&config, ®istry_config, versions).await?;