From eb111aad339c31ec4639eb9e3193a621e0c0b898 Mon Sep 17 00:00:00 2001 From: Jonathan Strong Date: Thu, 15 Sep 2022 15:49:30 -0400 Subject: [PATCH] implement url templates in addition to url templates, this also includes special handling for crates.io: crates.io index config.json dl: "https://crates.io/api/v1/crates" request to "https://crates.io/api/v1/crates/{crate}/{vers}/download" will return a 302 redirect to "https://static.crates.io/crates/{crate}/{crate}-{vers}.crate". rather than follow the 302, this detects if the dl value is crates.io (`is_crates_io` method) and performs the redirect preemptively. in discussion with crates.io team on discord, @carol10cents indicated this would avoid need to throttle requests as they would go right to cdn instead of hitting crates.io webserver. --- Cargo.lock | 3 +- Cargo.toml | 3 +- src/main.rs | 116 +++++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 109 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cff21fa..e1e8ae9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1323,7 +1323,7 @@ checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" [[package]] name = "registry-backup" -version = "0.3.0" +version = "0.4.0-rc.1" dependencies = [ "chrono", "clap", @@ -1341,6 +1341,7 @@ dependencies = [ "toml", "tracing", "tracing-subscriber", + "url", "walkdir", ] diff --git a/Cargo.toml b/Cargo.toml index 8869e78..771108e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "registry-backup" authors = ["Jonathan Strong "] -version = "0.3.0" +version = "0.4.0-rc.1" edition = "2021" publish = ["shipyard-rs-public"] readme = "README.md" @@ -38,6 +38,7 @@ pretty_toa = "1" tera = { version = "1", optional = true } chrono = { version = "0.4", optional = true } regex = "1.6" +url = "2" [features] default = [] diff --git a/src/main.rs b/src/main.rs index 7fac007..1a653cb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,7 +12,7 @@ use clap::Parser; use 
futures::stream::StreamExt; use governor::prelude::*; use governor::{Quota, RateLimiter}; -use reqwest::header::{ACCEPT, AUTHORIZATION, CONTENT_TYPE}; +use reqwest::header::AUTHORIZATION; // ACCEPT, CONTENT_TYPE}; use serde::Deserialize; use tokio::io::AsyncBufReadExt; use tracing::{debug, error, info, warn}; @@ -20,6 +20,9 @@ use tracing_subscriber::filter::EnvFilter; type AnyError = Box; +// const CRATESIO_INDEX: &str = "https://github.com/rust-lang/crates.io-index.git"; +const CRATESIO_DL_URL: &str = "https://crates.io/api/v1/crates"; + /// type representing the schema of the config.json file /// placed at the root of the crate index repo. /// @@ -43,11 +46,62 @@ pub struct RegistryConfig { pub auth_required: Option, } +impl RegistryConfig { + pub fn is_crates_io(&self) -> bool { + self.dl == CRATESIO_DL_URL + } + + pub fn get_dl_url(&self, name: &str, version: &str, cksum: &str) -> String { + const TEMPLATE_KEYS: [&str; 5] = [ + "{crate}", + "{version}", + "{prefix}", + "{lowerprefix}", + "{sha256-checksum}", + ]; + + if self.is_crates_io() { + // instead of following 302 redirect from /api endpoint, just preemptively + // get the static cdn url + format!( + "https://static.crates.io/crates/{name}/{name}-{version}.crate", + name = name, + version = version, + ) + } else if TEMPLATE_KEYS.iter().any(|k| self.dl.contains(k)) { + let mut out = self.dl.clone(); + + if self.dl.contains("{prefix}") { + let prefix = relative_index_file_helper(name).join("/"); + out = out.replace("{prefix}", &prefix); + } + + if self.dl.contains("{lowerprefix}") { + let prefix = relative_index_file_helper(&name.to_lowercase()).join("/"); + out = out.replace("{lowerprefix}", &prefix); + } + + out = out.replace("{crate}", name); + out = out.replace("{version}", version); + out = out.replace("{sha256-checksum}", cksum); + out + } else { + format!( + "{dl}/{name}/{version}/download", + dl = self.dl, + name = name, + version = version, + ) + } + } +} + /// One version per line in the index 
// relative_index_* fns taken from rust-lang/crates.io source code

/// Returns the segments of the relative path to the crate index file.
/// Does not perform conversion to lowercase.
///
/// The last segment is always `name` itself. The segments before it depend on
/// the byte length of `name`:
///
/// * 1 -> `["1", name]`
/// * 2 -> `["2", name]`
/// * 3 -> `["3", <first byte>, name]`
/// * otherwise -> `[<bytes 0..2>, <bytes 2..4>, name]`
///
/// Names that cannot be split that way (the empty string, or a name whose
/// split points fall inside a multi-byte character) yield just `[name]`
/// instead of panicking, so one malformed index entry cannot abort a run.
fn relative_index_file_helper(name: &str) -> Vec<&str> {
    match name.len() {
        1 => vec!["1", name],
        2 => vec!["2", name],
        3 => match name.get(..1) {
            Some(first) => vec!["3", first, name],
            None => vec![name],
        },
        // Also reached for the empty string, where both lookups return `None`.
        _ => match (name.get(..2), name.get(2..4)) {
            (Some(head), Some(tail)) => vec![head, tail, name],
            _ => vec![name],
        },
    }
}
/// Renders a `dl` template that uses every supported marker and checks the
/// result against Cargo's documented marker semantics: `{prefix}` is only the
/// directory part derived from the crate name (`ca/rg` for `cargo`), and
/// `{lowerprefix}` is the lowercase form of that same directory part. Neither
/// includes the crate name itself.
#[test]
fn sanity_check_url_template_rendering() {
    let config = RegistryConfig {
        dl: "{prefix}__{lowerprefix}__{sha256-checksum}__{crate}__{version}.tar.gz".to_string(),
        api: String::new(),
        allowed_registries: vec![],
        auth_required: Some(true),
    };

    // names of four or more characters: two two-character directories
    assert_eq!(
        config.get_dl_url("iM-14yo-LoL", "0.69.42-rc.123", "c5b6fc73"),
        "iM/-1__im/-1__c5b6fc73__iM-14yo-LoL__0.69.42-rc.123.tar.gz",
    );

    // one-, two- and three-character names use the numeric buckets
    assert_eq!(
        config.get_dl_url("a", "1.0.0", "00"),
        "1__1__00__a__1.0.0.tar.gz",
    );
    assert_eq!(
        config.get_dl_url("ab", "1.0.0", "00"),
        "2__2__00__ab__1.0.0.tar.gz",
    );
    assert_eq!(
        config.get_dl_url("Abc", "1.0.0", "00"),
        "3/A__3/a__00__Abc__1.0.0.tar.gz",
    );
}