#![allow(unused_labels)] use std::path::{Path, PathBuf}; use std::collections::{BTreeMap, HashMap}; use std::io::prelude::*; use std::time::*; use serde::{Serialize, Deserialize}; use clap::Parser; use tracing::{debug, error, info, trace, warn}; use tracing_subscriber::filter::EnvFilter; use url::Url; use anyhow::{anyhow, bail, Error, Context}; use semver::Version; use futures::stream::StreamExt; use tokio::io::AsyncBufReadExt; use reqwest::header::AUTHORIZATION; use tempfile::TempDir; use rayon::prelude::*; use petgraph::stable_graph::StableGraph; use petgraph::visit::{Bfs, EdgeRef, Topo, Walker}; use petgraph::graph::NodeIndex; #[derive(Parser, Debug)] #[clap(author, version, global_setting(clap::AppSettings::DeriveDisplayOrder))] struct Opt { /// Config file with source directories and destination registry info #[clap(short, long, value_name = "PATH")] pub config_file: PathBuf, /// Perform all the work of generating `cargo publish` payloads, /// but don't send them to the destination registry server #[clap(long)] pub dry_run: bool, /// Load config file, validate the settings, and display the final loaded content /// to stdout, then exit #[clap(long)] pub validate: bool, /// Use to limit which crates from the source registry are published to the /// destination registry. Expects a regular expression which will be matched /// against the names of crates. Only crates with names that match the regex /// will be published. This field may also be specified at the top level of /// the config file. #[clap(long, value_name = "REGEX", alias = "filter")] pub filter_crates: Option, } #[derive(Debug, Clone, Deserialize)] #[serde(rename_all = "kebab-case")] pub struct DestinationRegistryConfig { #[serde(alias = "api")] pub api_url: Url, #[serde(alias = "token")] pub auth_token: String, } #[derive(Debug, Clone, Deserialize)] #[serde(rename_all = "kebab-case")] pub struct SourceRegistryConfig { #[serde(alias = "index")] pub index_dir: PathBuf, #[serde(alias = "crate-files")] pub crate_files_dir: PathBuf, } #[derive(Deserialize, Debug, Clone)] #[serde(rename_all = "kebab-case")] pub struct HttpConfig { /// Value of user-agent HTTP header #[serde(default = "default_user_agent")] pub user_agent: String, } const DEFAULT_USER_AGENT: &str = concat!("shipyard.rs-publish-tool/v", env!("CARGO_PKG_VERSION")); fn default_user_agent() -> String { DEFAULT_USER_AGENT.to_string() } impl Default for HttpConfig { fn default() -> Self { Self { user_agent: default_user_agent(), } } } #[derive(Debug, Clone, Deserialize)] #[serde(rename_all = "kebab-case")] pub struct Config { /// Do everything except actually publish to the destination registry. Can also be /// toggled using the --dry-run command line flag. #[serde(default)] pub dry_run: bool, /// Local directories with source registry files #[serde(alias = "source")] pub src: SourceRegistryConfig, /// Server information and authentication needed to publish to the /// destination registry #[serde(alias = "destination")] pub dst: DestinationRegistryConfig, /// Settings controlling the HTTP publish requests to the destination registry #[serde(default)] pub http: HttpConfig, /// Use to limit which crates from the source registry are published to the /// destination registry. Expects a regular expression which will be matched /// against the names of crates. Only crates with names that match the regex /// will be published. #[serde(default, alias = "filter")] pub filter_crates: Option, } impl Config { pub fn compile_filter(&self) -> Result, Error> { match self.filter_crates.as_ref() { Some(regex) => { let compiled = regex::Regex::new(regex).map_err(|e| { error!(%regex, err = ?e, "regex failed to compile: {}", e); e })?; Ok(Some(compiled)) } None => Ok(None), } } } /// fields we need from Cargo.toml [package] section to combine with IndexMeta /// to form a PublishMeta. #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub struct PackageStub { pub name: String, pub version: Version, #[serde(default)] pub authors: Vec, pub description: Option, pub license: Option, pub license_file: Option, #[serde(default)] pub categories: Vec, #[serde(default)] pub keywords: Vec, pub readme: Option, pub repository: Option, pub homepage: Option, pub documentation: Option, pub links: Option, } /// Example from post-cargo publish Cargo.toml /// ```toml,ignore /// [dependencies.docyard] /// version = "0.31.0" /// registry-index = "ssh://git@ssh.shipyard.rs/shipyard-rs/crate-index.git" /// features = [ /// "auth", /// "storage", /// ] /// default-features = false /// ``` #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub struct GeneratedManifestDependency { #[serde(rename = "registry-index")] pub registry_index: Option, } /// for parsing Cargo.toml to extract missing PublishMeta fields that do not appear /// in IndexMeta #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub struct ManifestStub { pub package: PackageStub, // TODO: parse + modify the generated Cargo.toml (not Cargo.toml.original) // to rewrite the `registry-index` fields. // // we will also need to recompute the cksum } /// full definition of cargo publish json #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub struct PublishMeta { pub name: String, #[serde(alias = "version")] pub vers: semver::Version, #[serde(alias = "dependencies")] #[serde(default)] pub deps: Vec, #[serde(default)] pub features: BTreeMap>, #[serde(default)] pub authors: Vec, pub description: Option, pub documentation: Option, pub homepage: Option, pub readme: Option, pub readme_file: Option, #[serde(default)] pub keywords: Vec, #[serde(default)] pub categories: Vec, pub license: Option, pub license_file: Option, pub repository: Option, #[serde(skip_serializing_if = "Option::is_none")] pub links: Option, #[serde(skip_serializing_if = "Option::is_none")] pub badges: Option>, /// from ancient cargo versions #[serde(skip_serializing_if = "Option::is_none")] pub features2: Option>>, /// from ancient cargo versions #[serde(skip_serializing_if = "Option::is_none")] pub v: Option, } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub struct PublishDependency { pub optional: bool, pub default_features: bool, pub name: String, pub features: Vec, // cargo and crates-io have this as string #[serde(alias = "req")] pub version_req: semver::VersionReq, pub target: Option, // crates-io has this as option pub kind: DependencyKind, #[serde(skip_serializing_if = "Option::is_none")] pub registry: Option, #[serde(skip_serializing_if = "Option::is_none")] pub explicit_name_in_toml: Option, } impl From for PublishDependency { fn from(dep: IndexDependency) -> Self { Self { name: dep.name, features: dep.features, default_features: dep.default_features, optional: dep.optional, target: dep.target, kind: dep.kind, registry: dep.registry, version_req: dep.req, explicit_name_in_toml: dep.package, } } } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub struct IndexMeta { // same everything as publish metadata pub name: String, #[serde(alias = "version")] pub vers: semver::Version, #[serde(alias = "dependencies")] pub features: BTreeMap>, #[serde(skip_serializing_if = "Option::is_none")] pub links: Option, #[serde(skip_serializing_if = "Option::is_none")] pub badges: Option>, // modified format/field names pub deps: Vec, // fields that don't appear in publish metadata pub cksum: String, pub yanked: bool, // ancient fields, these were actually written // on sanskrit on stone tablets #[serde(skip_serializing_if = "Option::is_none")] pub features2: Option>>, #[serde(skip_serializing_if = "Option::is_none")] pub v: Option, } #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub struct IndexDependency { /// corresponds to `explicit_name_in_toml` field in `publish::Dependency` /// when a dep is renamed in Cargo.toml, otherwise same as `package`. pub name: String, /// corresponds to `name` in `publish::Dependency` #[serde(skip_serializing_if = "Option::is_none")] pub package: Option, /// in publish meta, this field is called `version_req`, and the index /// format requires it to be renamed to `req` #[serde(alias = "version_req")] pub req: semver::VersionReq, pub features: Vec, pub optional: bool, pub default_features: bool, pub target: Option, pub kind: DependencyKind, #[serde(skip_serializing_if = "Option::is_none")] pub registry: Option, } /// Section in which this dependency was defined #[derive(Copy, Clone, Serialize, Deserialize, Debug, PartialEq, PartialOrd, Ord, Eq, Hash)] #[serde(rename_all = "lowercase")] pub enum DependencyKind { /// Used at run time Normal, /// Used at build time, not available at run time Build, /// Not fetched and not used, except for when used direclty in a workspace Dev, } impl PublishMeta { pub fn new( index_meta: IndexMeta, manifest: ManifestStub, readme: Option, ) -> Self { let ManifestStub { package } = manifest; PublishMeta { name: package.name, vers: package.version, deps: index_meta.deps.into_iter().map(From::from).collect(), features: index_meta.features, authors: package.authors, description: package.description, documentation: package.documentation, homepage: package.homepage, readme, readme_file: package.readme, keywords: package.keywords, categories: package.categories, license: package.license, license_file: package.license_file, repository: package.repository, links: package.links, badges: index_meta.badges, features2: index_meta.features2, v: index_meta.v, } } } fn serialize_publish_payload( publish_meta_json: &[u8], dot_crate_bytes: &[u8], ) -> Vec { assert!(publish_meta_json.len() <= u32::MAX as usize); assert!(dot_crate_bytes.len() <= u32::MAX as usize); let mut out: Vec = Vec::with_capacity( publish_meta_json.len() + dot_crate_bytes.len() + 8 // 2x u32 lengths ); out.extend_from_slice(&(publish_meta_json.len() as u32).to_le_bytes()[..]); out.extend_from_slice(publish_meta_json); out.extend_from_slice(&(dot_crate_bytes.len() as u32).to_le_bytes()[..]); out.extend_from_slice(dot_crate_bytes); out } fn extract_manifest_files_from_tar(rdr: R) -> Result { let mut archive = tar::Archive::new(rdr); let mut cargo_toml = None; let mut cargo_toml_orig = None; let mut cargo_lock = None; for entry in archive.entries()? { let mut entry = entry?; let path = entry.path()?; if path.ends_with("Cargo.toml.orig") { let mut data = String::new(); entry.read_to_string(&mut data)?; cargo_toml_orig = Some(data); } else if path.ends_with("Cargo.toml") { let mut data = String::new(); entry.read_to_string(&mut data)?; cargo_toml = Some(data); } else if path.ends_with("Cargo.lock") { let mut data = String::new(); entry.read_to_string(&mut data)?; cargo_lock = Some(data); } if cargo_toml.is_some() && cargo_toml_orig.is_some() && cargo_lock.is_some() { break } } if !(cargo_toml.is_some() && cargo_toml_orig.is_some()) { anyhow::bail!("some required manifest files missing in .crate archive \ (cargo_toml={:?} cargo_toml_orig={:?} cargo_lock={:?})", cargo_toml.is_some(), cargo_toml_orig.is_some(), cargo_lock.is_some(), ); } Ok(ManifestFiles { cargo_toml: cargo_toml.unwrap(), cargo_toml_orig: cargo_toml_orig.unwrap(), cargo_lock, }) } fn extract_readme_from_tar(rdr: R, readme_path: &Path) -> Result, Error> { let mut archive = tar::Archive::new(rdr); for entry in archive.entries()? { let mut entry = entry?; let path = entry.path()?; if path == readme_path || path.ends_with(readme_path) { let mut out = String::new(); entry.read_to_string(&mut out)?; return Ok(Some(out)) } } Ok(None) } fn setup_logger() { let env_filter = EnvFilter::from_default_env(); let builder = tracing_subscriber::fmt() .with_env_filter(env_filter) .with_ansi(true); builder.init(); } fn load_config_file(opt: &Opt) -> Result { if !opt.config_file.exists() { bail!("path does not exist: {:?}", opt.config_file); } let toml = std::fs::read_to_string(&opt.config_file)?; let mut config: Config = toml::from_str(&toml) .context("read config file, but unable to parse toml - check \ format against example config")?; // augment using command line opts config.filter_crates = config.filter_crates.or_else(|| opt.filter_crates.clone()); config.dry_run |= opt.dry_run; Ok(config) } fn is_hidden(entry: &walkdir::DirEntry) -> bool { entry .file_name() .to_str() .map(|s| s.starts_with('.')) .unwrap_or(false) } async fn get_index_metas( config: &Config, ) -> Result>, Error> { let filter = config.compile_filter()?; let mut n_excl = 0; let files: Vec<(String, PathBuf)> = walkdir::WalkDir::new(&config.src.index_dir) .max_depth(3) .into_iter() .filter_entry(|e| !is_hidden(e)) .filter_map(|res| match res { Ok(entry) => { if entry.file_type().is_file() && entry.depth() >= 2 && entry.depth() <= 3 { let path = entry.into_path(); let crate_name: &str = path.file_name().and_then(|x| x.to_str()).unwrap_or(""); if let Some(filter) = filter.as_ref() { if !filter.is_match(crate_name.as_ref()) { trace!(%crate_name, "crate excluded by filter"); n_excl += 1; return None; } } debug!(?path, "found crate index metadata file to parse"); Some((crate_name.to_owned(), path)) } else { None } } Err(e) => { warn!(error = ?e, "walkdir result is error"); None } }) .collect(); let n_files = files.len(); info!("found {} crate index metadata files to parse", n_files); if n_excl > 0 { warn!( regex = %config.filter_crates.as_deref().unwrap_or(""), n_files, n_excl, "regex filter (--filter-crates) excluded {} crates", n_excl, ); } let crate_versions: Vec), Error>> = futures::stream::iter(files.into_iter().map(|(crate_name, path)| { async move { let file = tokio::fs::File::open(&path).await.map_err(|e| { error!(err = ?e, ?path, "failed to open file"); e })?; let buf = tokio::io::BufReader::new(file); let mut out = Vec::new(); let mut lines = buf.lines(); 'lines: while let Some(line) = lines.next_line().await? { let index_meta: IndexMeta = serde_json::from_str(&line) .map_err(|e| { error!(err = ?e, ?path, "failed to parse line"); e })?; out.push(index_meta); } debug!(crate_name = %out.first().map(|x| x.name.as_str()).unwrap_or("na"), "parsed {} crate versions from metadata file", out.len() ); Ok((crate_name, out)) } })) .buffer_unordered(num_cpus::get()) .collect() .await; let mut total_number_of_crate_versions = 0; // map of crate-name => [IndexMeta] (one per published version) let crate_versions: HashMap> = crate_versions .into_iter() .filter_map(|result| match result { Ok((crate_name, xs)) => { total_number_of_crate_versions += xs.len(); Some((crate_name, xs)) } Err(e) => { error!(err = ?e, "parsing metadata failed, skipping file"); None } }) .collect(); info!( n_files, n_excl, n_crates = crate_versions.len(), total_number_of_crate_versions, "parsed {} crate version metadata entries from index", total_number_of_crate_versions, ); Ok(crate_versions) } #[derive(Debug, Clone, Deserialize, Eq, PartialEq, Default)] pub struct PublishWarnings { #[serde(default)] pub invalid_categories: Vec, #[serde(default)] pub invalid_badges: Vec, #[serde(default)] pub other: Vec, } #[derive(Debug, Clone, Deserialize, Eq, PartialEq, Default)] pub struct PublishResponse { #[serde(default)] pub warnings: PublishWarnings, } struct ManifestFiles { cargo_toml: String, cargo_toml_orig: String, cargo_lock: Option, } struct VersionMeta { index_meta: IndexMeta, manifest_files: ManifestFiles, dot_crate_path: PathBuf, manifest: ManifestStub, readme: Option, tmp: TempDir, meta: cargo_metadata::Metadata, } #[derive(Debug, Clone, PartialEq, Eq, Hash)] struct Node<'a> { name: &'a str, vers: Version, } // async fn process_crates( // config: &Config, // crate_versions: HashMap>, // ) -> Result<(), Error> { // let http_client = reqwest::Client::builder() // .user_agent(&config.http.user_agent) // .build()?; // // let publish_url = config.dst.api_url.join("/api/v1/crates/new")?; // let publish_meta = PublishMeta::new(index_meta, manifest, readme); // // debug!("built publish meta using crate index json and the Cargo.toml manifest in the .crate targz archive:\n{:#?}\n", publish_meta); // // let publish_meta_json = serde_json::to_vec(&publish_meta)?; // let payload = serialize_publish_payload(&publish_meta_json, &dot_crate_bytes); // debug!( // n_bytes = payload.len(), // %crate_name, // %version, // "serialized publish payload", // ); // // if config.dry_run { // debug!( // %crate_name, // %version, // %publish_url, // "skipping publish (--dry-run mode)", // ); // continue; // } // // let resp = http_client.put(publish_url.clone()) // .header(AUTHORIZATION, &config.dst.auth_token) // .body(payload) // .send() // .await?; // // debug!(status = ?resp.status(), "rcvd server response to publish request"); // // let resp: PublishResponse = resp // .error_for_status()? // .json() // .await?; // let PublishResponse { warnings } = resp; // // let mut any_warnings = false; // // for warning in warnings.invalid_categories.iter() { // warn!(%crate_name, %version, "registry server invalid category warning: {}", warning); // any_warnings = true; // } // // for warning in warnings.invalid_badges.iter() { // warn!(%crate_name, %version, "registry server invalid badge warning: {}", warning); // any_warnings = true; // } // // for warning in warnings.other.iter() { // warn!(%crate_name, %version, "registry server 'other' warning: {}", warning); // any_warnings = true; // } // // trace!("server response body:\n{warnings:#?}"); // // info!( // %crate_name, // %version, // any_warnings, // "published crate version in {:?}!", // begin.elapsed(), // ); fn parse_manifests( config: &Config, crate_versions: HashMap>, ) -> Result>, Error> { let begin = Instant::now(); let out: HashMap> = crate_versions // .into_par_iter() .into_iter() .map(|(crate_name, versions)| -> Result<(String, Vec), Error> { let begin = Instant::now(); let mut version_metas = Vec::new(); for index_meta in versions { let version = index_meta.vers.clone(); trace!(%crate_name, %version, "processing crate version"); let dot_crate_path = config.src.crate_files_dir .join(&format!("{}/{}/download", crate_name, index_meta.vers)); verify_file_exists(&dot_crate_path)?; trace!(path = ?dot_crate_path, "reading .crate file"); let dot_crate_bytes = std::fs::read(&dot_crate_path) .with_context(|| { format!("failed to read .crate file for \ {crate_name} v{0} with path {dot_crate_path:?}", index_meta.vers, ) })?; trace!("extracting Cargo.toml from .crate targz archive"); let decoder = flate2::read::GzDecoder::new(&dot_crate_bytes[..]); let manifest_files = extract_manifest_files_from_tar(decoder)?; if manifest_files.cargo_lock.is_none() { debug!(%crate_name, %version, "Cargo.lock not present in .crate archive"); } let manifest: ManifestStub = toml::from_str(&manifest_files.cargo_toml)?; let mut readme: Option = None; if let Some(readme_path) = manifest.package.readme.as_ref() { let decoder = flate2::read::GzDecoder::new(&dot_crate_bytes[..]); if let Some(readme_content) = extract_readme_from_tar(decoder, readme_path)? { trace!(length = readme_content.len(), "extracted readme file content from .crate targz archive"); readme = Some(readme_content); } } let tmp = TempDir::new()?; let decoder = flate2::read::GzDecoder::new(&dot_crate_bytes[..]); tar::Archive::new(decoder).unpack(tmp.path())?; trace!(tmpdir = ?tmp.path(), "unpacked .crate archive to temp dir"); let target_dir = tmp.path().join("target"); std::fs::create_dir(&target_dir)?; let meta = cargo_metadata::MetadataCommand::new() .manifest_path(tmp.path().join(&format!("{crate_name}-{version}/Cargo.toml"))) //.env("CARGO_TARGET_DIR", &target_dir) .other_options(vec!["-vv".to_string()]) .verbose(true) // .other_options(["--frozen"].into_iter().map(|x| x.to_owned()).collect::>()) .exec()?; version_metas.push(VersionMeta { index_meta, manifest_files, dot_crate_path, manifest, readme, tmp, meta, }); } debug!(%crate_name, "parsed {} manifests in {:?}", version_metas.len(), begin.elapsed()); Ok((crate_name, version_metas)) }).collect::>()?; info!("parsed crate version manifests in {:?}", begin.elapsed()); Ok(out) } // fn get_registry_dep<'a>(index_dep: &'a IndexDependency) -> Option> { // match index_dep.registry.as_ref() { // None => Some(Node { name: index_dep.name.as_str(), vers: index_dep.vers.clone() }), // Some(index) if index.contains("github.com/rust-lang/crates.io-index") => None, // Some(other) => panic!("unexpected registry value: {}", other), // } // } // fn build_dependency_graph<'a>( // crate_versions: &'a HashMap>, // ) -> (StableGraph, ()>, HashMap, NodeIndex>) { // let begin = Instant::now(); // // let mut graph = StableGraph::new(); // let mut index: HashMap, NodeIndex> = Default::default(); // // macro_rules! get_ix { // ($node:expr) => {{ // let key_exists = ix.contains_key($node); // if !key_exists { // let ix = graph.add_node($node.clnoe()); // index.insert(node.clone(), ix); // ix // } else { // *index[$node] // } // }} // } // // for (name, versions) in crate_versions.iter() { // for version_meta in versions.iter() { // let v = &version_meta.index_meta; // let node = Node { name: name.as_str(), vers: v.vers.clone() }; // let key_exists = ix.contains_key(&node); // let ix = get_ix!(&node); // for dep_node in v.deps.iter().filter_map(get_registry_dep) { // let jx = get_ix!(dep_node); // graph.add_edge(ix, js, ()); // } // } // } // // info!( // n_nodes = graph.node_count(), // n_edges = graph.edge_count(), // "built dependency graph for entire registry in {:?}", begin.elapsed(), // ); // // (graph, index) // } async fn verify_dir_exists>(path: P) -> Result<(), Error> { match tokio::fs::metadata(path.as_ref()).await { Ok(meta) if meta.is_dir() => Ok(()), Ok(meta) /* if ! meta.is_dir() */ => { debug_assert!( ! meta.is_dir()); bail!("path exists, but is not a directory: {:?}", path.as_ref()) } Err(e) if e.kind() == std::io::ErrorKind::NotFound => { bail!("path does not exist: {}", path.as_ref().display()); } Err(e) => Err(e.into()), } } fn verify_file_exists>(path: P) -> Result<(), Error> { match std::fs::metadata(path.as_ref()) { Ok(meta) if meta.is_file() => Ok(()), Ok(meta) /* if ! meta.is_file() */ => { debug_assert!( ! meta.is_file()); bail!("path exists, but is not a file: {:?}", path.as_ref()) } Err(e) if e.kind() == std::io::ErrorKind::NotFound => { bail!("path does not exist: {}", path.as_ref().display()); } Err(e) => Err(e.into()), } } fn main() -> Result<(), Error> { let begin = Instant::now(); dotenvy::dotenv().ok(); let opt = Opt::parse(); setup_logger(); let config = load_config_file(&opt)?; let rt = tokio::runtime::Runtime::new()?; rt.block_on(verify_dir_exists(&config.src.index_dir))?; rt.block_on(verify_dir_exists(&config.src.crate_files_dir))?; if opt.validate { println!("{:#?}", config); return Ok(()) } let krates = rt.block_on(get_index_metas(&config))?; let manifests = parse_manifests(&config, krates)?; // let (graph, ix) = build_dependency_graph(&manifests); drop(manifests); drop(rt); info!("finished in {:?}", begin.elapsed()); Ok(()) }