Compare commits

...

26 Commits

Author SHA1 Message Date
Jonathan Strong 2c83cd14c5 add timeouts to download request 11 months ago
Jonathan Strong 69afa9a804 adds --skip and --limit options 1 year ago
Jonathan Strong 25ade0806b fix it so when you run in --dry-mode, it doesn't start publishing crates (!) 1 year ago
Jonathan Strong c890c5f429 re-generate docs 1 year ago
Jonathan Strong 4c2a9e5fc2 rustfmt 1 year ago
Jonathan Strong 7e9c5ec9bd silence clippy 1 year ago
Jonathan Strong 2ad520d230 silence clippy 1 year ago
Jonathan Strong a9c8906096 rustfmt 1 year ago
Jonathan Strong dd4eb957ad document `publish` 1 year ago
Jonathan Strong b717fc952b clean up code 1 year ago
Jonathan Strong 934559fd41 clean up code, removed unused stuff, etc. 1 year ago
Jonathan Strong de6c86115b fix editing of deps in manifest 1 year ago
Jonathan Strong 5d3ea67dfc first attempt at dealing with conditional keys - correct solution will use toml_edit::visit_mut::VisitMut 1 year ago
Jonathan Strong 9545b350a4 additional logging 1 year ago
Jonathan Strong 24bbd17e82 bugfixes, etc. 1 year ago
Jonathan Strong 1afd7b465f add check that index-dir exists 1 year ago
Jonathan Strong 4a12c2802c remove hard-coded crate name filter erroneously left in from debugging 1 year ago
Jonathan Strong c2f8d60922 working rough cut using publish log csv and cargo publish command 1 year ago
Jonathan Strong cc275bacb7 hit a bit of a dead end for now 1 year ago
Jonathan Strong fa147a8ee4 easier solution (?): python script to extract which crates were published in which order from the index repo 1 year ago
Jonathan Strong 7ae5ab55d5 properly handle the server's publish response 1 year ago
Jonathan Strong b02731ecd6 working first iteration for evaluation 1 year ago
Jonathan Strong 8b57bf70b0 handle errors better in main 1 year ago
Jonathan Strong 315c27b25b set `dry_run` to `true` in `Config` if passed as command line flag 1 year ago
Jonathan Strong 8bb8d218c7 load .env file vars 1 year ago
Jonathan Strong 3cbe8997a0 begin work on mass publish tool - scoping out how to build publish meta from index meta + Cargo.toml 1 year ago
  1. 2
      .gitignore
  2. 1267
      Cargo.lock
  3. 32
      Cargo.toml
  4. 275
      README.md
  5. 207
      doc/README.tera.md
  6. 4
      doc/cli-menu.txt
  7. 2
      doc/just-commands.txt
  8. 19
      doc/publish-cli-menu.txt
  9. 10
      justfile
  10. 23
      publish-config.toml.sample
  11. 92
      script/get-publish-history.py
  12. 4
      src/generate-readme.rs
  13. 53
      src/main.rs
  14. 896
      src/publish.rs

2
.gitignore vendored

@ -1,4 +1,6 @@
/target
*.swp
config.toml
publish-config.toml
/output
.env

1267
Cargo.lock generated

File diff suppressed because it is too large Load Diff

32
Cargo.toml

@ -1,9 +1,9 @@
[package]
name = "registry-backup"
authors = ["Jonathan Strong <jstrong@shipyard.rs>"]
version = "0.4.1"
version = "0.5.0-beta.1"
edition = "2021"
publish = ["shipyard-rs-public"]
#publish = ["shipyard-rs-public"]
readme = "README.md"
repository = "https://git.shipyard.rs/jstrong/registry-backup"
homepage = "https://git.shipyard.rs/jstrong/registry-backup"
@ -15,6 +15,10 @@ description = "CLI tool for backup/export of .crate files from a registry server
name = "registry-backup"
path = "src/main.rs"
[[bin]]
name = "publish"
path = "src/publish.rs"
[[bin]]
name = "generate-readme"
path = "src/generate-readme.rs"
@ -36,10 +40,28 @@ num_cpus = "1.3"
governor = "0.4.2"
pretty_toa = "1"
tera = { version = "1", optional = true }
chrono = { version = "0.4", optional = true }
chrono = { version = "0.4.22, < 0.4.23", features = ["serde"], optional = true }
regex = "1.6"
url = "2"
url = { version = "2", features = ["serde"] }
semver = { version = "1", features = ["serde"] }
tar = "0.4.38"
anyhow = "1"
dotenvy = "0.15"
flate2 = "1"
tempfile = { version = "3.8.1", optional = true }
rayon = { version = "1.8", optional = true }
csv = { version = "1", optional = true }
convert_case = { version = "0.6", optional = true }
toml_edit = { version = "0.21", optional = true }
[features]
default = []
default = ["publish"]
docs = ["tera", "chrono"]
publish = [
"csv",
"chrono",
"tempfile",
"rayon",
"convert_case",
"toml_edit",
]

275
README.md

@ -1,13 +1,25 @@
# registry-backup
A command line utility for downloading all .crate files hosted by a Cargo registry server.
Command line utilities for backup, export, and migration of a Rust private crate registry.
Use cases:
- **Backup:** retrieve a registry server's files for backup storage
- **Export:** pull the files so you can host them at another registry server
- **Migration:** publish downloaded .crate files to a new private registry, including modifying the `Cargo.toml` manifests of each published crate version to make it compatible with the destination registry
## Example Usage:
## Tools
There are two binaries in the repo:
- `registry-backup`: for downloading all .crate files hosted by a Cargo registry server
- `publish`: for publishing the .crate files downloaded by `registry-backup` to a different registry
## `registry-backup`
`registry-backup` is a tool to download all of the .crate files hosted by a Cargo registry server.
### Example Usage
Specify the registry index either as a local path (`--index-path`)...
@ -28,18 +40,18 @@ $ RUST_LOG=info registry-backup \
--auth-token ${AUTH_TOKEN} # for private registry, need auth
```
## Install
### Install
```console
$ cargo install registry-backup --git https://git.shipyard.rs/jstrong/registry-backup.git
```
## Runtime Options
### Runtime Options
```console
$ ./target/release/registry-backup --help
registry-backup 0.4.1
registry-backup 0.5.0-beta.1
Jonathan Strong <jstrong@shipyard.rs>
Download all .crate files from a registry server
@ -82,7 +94,7 @@ OPTIONS:
-U, --user-agent <USER_AGENT>
Value of user-agent HTTP header
[default: registry-backup/v0.4.1]
[default: registry-backup/v0.5.0-beta.1]
-R, --requests-per-second <INT>
Requests to registry server will not exceed this rate
@ -116,7 +128,7 @@ OPTIONS:
```
## Configuration File
### Configuration File
A toml configuration file may be used instead of command line flags. A sample file (`config.toml.sample`) is included. From the example file:
@ -152,6 +164,251 @@ $ just release-build # alternatively, cargo build --bin registry-backup --releas
# cp target/release/registry-backup ~/.cargo/bin/
```
## `publish`
`publish` is a tool to publish all of the crate versions from a *source registry* to a second *destination registry*.
### Usage Overview
`publish` is different from `registry-backup` in that it requires several steps, including the use of a Python script.
In general, migrating all of the crate versions to another registry is relatively complex, compared to just downloading the .crate files. Migrating to a new registry involves the following (big picture) steps:
1) extracting the order that crate versions were published to the source registry from the git history of the crate index repository
2) extracting the source files, including `Cargo.toml` manifests, from the downloaded `.crate` files
3) modifying the `Cargo.toml` manifests for each crate version so the crate will be compatible with the destination registry
4) publishing the crate versions, in the right order and using the modified `Cargo.toml` manifests, to the destination registry
### Background Context: `cargo publish`, `.crate` Files, and `Cargo.toml.orig`
When you run the `cargo publish` command to publish a crate version to a registry server, it generates an alternate `Cargo.toml` manifest based on the contents of the original `Cargo.toml` in combination with the configured settings with which the command was invoked.
For example, if you had configured a private registry in `~/.cargo/config.toml`:
```toml
# ~/.cargo/config.toml
[registries.my-private-registry]
index = "ssh://git@ssh.shipyard.rs/my-private-registry/crate-index.git"
```
And then added a dependency from that registry in a `Cargo.toml` for a crate:
```toml
# Cargo.toml
[package]
name = "foo"
publish = ["my-private-registry"]
[dependencies]
bar = { version = "1.0", registry = "my-private-registry" }
```
...`cargo publish` would convert the dependency into one with a hard-coded `registry-index` field that points to the specific index URL that was configured at the time it was invoked:
```
# cargo publish-generated Cargo.toml
[package]
name = "foo"
publish = ["my-private-registry"]
[dependencies]
bar = { version = "1.0", registry-index = "ssh://git@ssh.shipyard.rs/my-private-registry/crate-index.git" }
```
`cargo publish` includes the original `Cargo.toml` file at the path `Cargo.toml.orig` in the `.crate` file (actually a `.tar.gz` archive).
Since the `registry-index` entries generated by `cargo publish` point to the specific URL of the source registry, just publishing the `.crate` file as is to the destination registry will not suffice. To resolve this problem, `publish` uses the `Cargo.toml.orig` file contained in the `.crate` file, modifies the dependency entries according to the settings of the destination registry, and publishes them to the destination registry using `cargo publish` (i.e. discard the `cargo publish`-generated `Cargo.toml`, relying instead on the modified `Cargo.toml.orig` in combination with runtime settings provided as env vars to `cargo`).
### The Global Dependency Graph of a Registry and `publish-log.csv`
Once we have solved how to take a `.crate` file from the source registry and publish it to the destination registry, there is still the issue of which order the crate versions should be published. If crate `a` version 1.2.3 depends on crate `b` version 2.3.4, then crate `b` version 2.3.4 needs to have already been published to the registry at the time crate `a` version 1.2.3 is published, otherwise it will depend on a crate that does not (yet) exist (in the destination registry, at least). If you try to publish crates without respecting this global dependency graph using `cargo publish`, it will exit with an error, and it's not a good idea otherwise, either.
Building a dependency graph for the entire registry is certainly possible, theoretically. However, in practice it is tedious to do, mainly because it requires mirroring `cargo`'s dependency resolution process, just to be able to identify the full set of dependencies that would end up in the `Cargo.lock` file. That, in turn, requires using `cargo` (i.e. via the `cargo metadata` command), which is slow for large registries (only a single `cargo metadata` command can run at a time due to the use of lock files), and quite involved in terms of parsing the programmatically-generated outputs (wow it is amazing how many different forms crate metadata is represented in various `cargo`/registry contexts!).
To shortcut these complexities, `publish` relies on the use of a Python script to extract the order in which crate versions were published to a registry using the git history of the crate index repository.
The tool (`script/get-publish-history.py`) was based on an open source script that utilizes the `GitPython` library to traverse the commit history of a repo. With a few minutes' work, we were able to modify the script to extract the publish order of all the crate versions appearing in the crate index repository. And, as much as we love Rust (and do not share the same passion for Python), porting the code to Rust using the `git2` crate appeared to be quite a tedious project itself.
To generate a `.csv` file with the order in which crates were published, first clone the crate index repository, e.g.:
```
$ git clone ssh://git@ssh.shipyard.rs/my-private-registry/crate-index.git
```
Then run the script (it has two dependencies `GitPython` and `pandas`, both of which can be `pip install`ed or otherwise acquired using whatever terrible Python package manager you want):
```
$ python script/get-publish-history.py path/to/crate-index > publish-log.csv
```
You will need a `publish-log.csv` generated from the source registry to use `publish`.
(You might be wondering why we are relying on git history to reconstruct the publishing order. The primary reason is the crate index metadata (or any other metadata universally available from a crate registry) does not include any information about when each crate version was published.)
### Detailed Usage Example
##### 1) Clone the source registry crate index repository:
```
$ mkdir source-registry
$ git clone <source registry crate index repo url> source-registry/crate-index
```
##### 2) Use `registry-backup` to download all the `.crate` files from the source registry:
```
$ cargo install registry-backup --git https://git.shipyard.rs/jstrong/registry-backup.git # or build from source
$ RUST_LOG=info registry-backup \
--index-path source-registry/crate-index \
--output-path source-registry/crate-files
```
##### 3) Use the `get-publish-history.py` script to extract the crate version publish history:
```
$ . ../virtualenvs/my-env/activate # or whatever you use
$ pip install GitPython
$ pip install pandas
$ python3 script/get-publish-history.py source-registry/crate-index > source-registry/publish-log.csv
```
##### 4) Create a configuration file:
```toml
# publish-config.toml
# source registry config
[src]
index-dir = "source-registry/crate-index" # <- see step 1
crate-files-dir = "source-registry/crate-files" # <- see step 2
publish-history-csv = "source-registry/publish-log.csv" # <- see step 3
registry-name = "my-old-registry" # <- whatever label the source registry was given in Cargo.toml files
index-url = "https://github.com/my-org/crate-index.git" # <- index url, i.e. same as one provided in ~/.cargo/config.toml
# destination registry config
[dst]
index-url = "ssh://git@ssh.shipyard.rs/my-new-registry/crate-index.git"
registry-name = "my-new-registry" # can be same as old name or a different name
auth-token = "xxx" # auth token for publishing to the destination registry
```
##### 5) Build `publish`:
```
$ cargo build --bin publish --features publish --release
```
##### 6) Validate your config file (optional):
```
$ ./target/release/publish --config-file publish-config.toml --validate
```
##### 7) Publish to the destination registry using `publish`:
```
$ RUST_LOG=info ./target/release/publish --config-file publish-config.toml
```
### Expected Runtime
As an example, using `publish`, it took us about 50 minutes to migrate a registry with 77 crates and 937 versions. Results may vary based on the machine used to run `publish` as well as the performance of the destination registry server.
### Building `publish` (Full Example)
```
$ git clone https://git.shipyard.rs/jstrong/registry-backup.git
$ cd registry-backup
$ just release-build-publish # alternately, cargo build --bin publish --features publish --release
```
Note: `--release` really is quite a bit faster, at least for larger registries.
### Configuration File
Annotated example configuration file:
```toml
# optional field for providing a regex-based filter
# to limit which crates are published to the destination
# registry. only crates with names matching the regex will
# be published.
#
filter-crates = "^."
# do everything except actually publish to the destination registry
dry-run = false
# source registry config
[src]
index-dir = "path/to/crate-index/repo" # git clone of crate index repository
crate-files-dir = "path/to/crate/files" # i.e. files downloaded by registry-backup tool
publish-history-csv = "path/to/publish-log.csv" # see docs above
registry-name = "my-old-registry" # whatever label the source registry was given in Cargo.toml files
index-url = "https://github.com/my-org/crate-index.git" # index url, i.e. same as one provided in ~/.cargo/config.toml
# destination registry config
[dst]
index-url = "ssh://git@ssh.shipyard.rs/my-new-registry/crate-index.git" # index url of new registry
registry-name = "my-new-registry" # can be same as old name or a different name
auth-token = "xxx" # auth token for publishing to the destination registry
```
### Runtime Options
```console
$ ./target/release/publish --help
registry-backup 0.5.0-beta.1
Jonathan Strong <jstrong@shipyard.rs>
USAGE:
publish [OPTIONS] --config-file <PATH>
OPTIONS:
-c, --config-file <PATH> Config file with source directories and destination registry info
--dry-run Perform all the work of generating `cargo publish` payloads, but
don't send them to the destination registry server
--validate Load config file, validate the settings, and display the final
loaded content to stdout, then exit
--filter-crates <REGEX> Use to limit which crates from the source registry are published
to the destination registry. Expects a regular expression which
will be matched against the names of crates. Only crates with
names that match the regex will be published. This field may also
be specified at the top level of the config file
-h, --help Print help information
-V, --version Print version information
```
### Configuration File
A toml configuration file may be used instead of command line flags. A sample file (`config.toml.sample`) is included. From the example file:
```toml
dry-run = false
filter-crates = "^."
[registry]
index-url = "ssh://git@ssh.shipyard.rs/shipyard-rs-public/crate-index.git"
# alternatively, specify a local dir
# index-path = "/path/to/cloned/index"
auth-token = "xxx"
[http]
user-agent = "registry-backup/v0.1.0"
requests-per-second = 100
max-concurrent-requests = 50
[output]
path = "output"
overwrite-existing = false
format = "{crate}/{version}/download"
```
## Running Tests
```console
@ -171,12 +428,14 @@ Available recipes:
cargo +args='' # cargo wrapper; executes a cargo command using the settings in justfile (RUSTFLAGS, etc.)
check +args='' # cargo check wrapper
debug-build +args='' # cargo build wrapper - builds registry-backup in debug mode
debug-build-publish +args='' # cargo build wrapper - builds publish tool in debug mode
generate-readme # generate updated README.md
get-crate-version
install # cargo install registry-backup via git dep
pre-release # check, run tests, check non-error output for clippy, run rustfmt
release # release version (regenerate docs, git tag v0.0.0)
release-build +args='' # cargo build --release wrapper - builds registry-backup in release mode
release-build-publish +args='' # cargo build --release wrapper - builds publish tool in release mode
release-prep # get everything all ready for release
show-build-env # diagnostic command for viewing value of build variables at runtime
test +args='' # cargo test wrapper
@ -193,7 +452,7 @@ The commands that mirror cargo commands (e.g. `just test`) are included for the
This file is generated using a template (`doc/README.tera.md`) rendered using updated outputs of the CLI menu, config sample, and other values.
This version of `README.md` was generated at `Thu, 08 Dec 2022 02:23:17 +0000` based on git commit `3241e207`.
This version of `README.md` was generated at `Fri, 10 Nov 2023 01:30:48 +0000` based on git commit `4c2a9e5f`.
To (re-)generate the `README.md` file, use the justfile command:

207
doc/README.tera.md

@ -1,13 +1,25 @@
# registry-backup
A command line utility for downloading all .crate files hosted by a Cargo registry server.
Command line utilities for backup, export, and migration of a Rust private crate registry.
Use cases:
- **Backup:** retrieve a registry server's files for backup storage
- **Export:** pull the files so you can host them at another registry server
- **Migration:** publish downloaded .crate files to a new private registry, including modifying the `Cargo.toml` manifests of each published crate version to make it compatible with the destination registry
## Example Usage:
## Tools
There are two binaries in the repo:
- `registry-backup`: for downloading all .crate files hosted by a Cargo registry server
- `publish`: for publishing the .crate files downloaded by `registry-backup` to a different registry
## `registry-backup`
`registry-backup` is a tool to download all of the .crate files hosted by a Cargo registry server.
### Example Usage
Specify the registry index either as a local path (`--index-path`)...
@ -28,13 +40,13 @@ $ RUST_LOG=info registry-backup \
--auth-token ${AUTH_TOKEN} # for private registry, need auth
```
## Install
### Install
```console
$ cargo install registry-backup --git https://git.shipyard.rs/jstrong/registry-backup.git
```
## Runtime Options
### Runtime Options
```console
$ ./target/release/registry-backup --help
@ -42,7 +54,7 @@ $ ./target/release/registry-backup --help
{{ cli_menu }}
```
## Configuration File
### Configuration File
A toml configuration file may be used instead of command line flags. A sample file (`config.toml.sample`) is included. From the example file:
@ -60,6 +72,191 @@ $ just release-build # alternatively, cargo build --bin registry-backup --releas
# cp target/release/registry-backup ~/.cargo/bin/
```
## `publish`
`publish` is a tool to publish all of the crate versions from a *source registry* to a second *destination registry*.
### Usage Overview
`publish` is different from `registry-backup` in that it requires several steps, including the use of a Python script.
In general, migrating all of the crate versions to another registry is relatively complex, compared to just downloading the .crate files. Migrating to a new registry involves the following (big picture) steps:
1) extracting the order that crate versions were published to the source registry from the git history of the crate index repository
2) extracting the source files, including `Cargo.toml` manifests, from the downloaded `.crate` files
3) modifying the `Cargo.toml` manifests for each crate version so the crate will be compatible with the destination registry
4) publishing the crate versions, in the right order and using the modified `Cargo.toml` manifests, to the destination registry
### Background Context: `cargo publish`, `.crate` Files, and `Cargo.toml.orig`
When you run the `cargo publish` command to publish a crate version to a registry server, it generates an alternate `Cargo.toml` manifest based on the contents of the original `Cargo.toml` in combination with the configured settings with which the command was invoked.
For example, if you had configured a private registry in `~/.cargo/config.toml`:
```toml
# ~/.cargo/config.toml
[registries.my-private-registry]
index = "ssh://git@ssh.shipyard.rs/my-private-registry/crate-index.git"
```
And then added a dependency from that registry in a `Cargo.toml` for a crate:
```toml
# Cargo.toml
[package]
name = "foo"
publish = ["my-private-registry"]
[dependencies]
bar = { version = "1.0", registry = "my-private-registry" }
```
...`cargo publish` would convert the dependency into one with a hard-coded `registry-index` field that points to the specific index URL that was configured at the time it was invoked:
```
# cargo publish-generated Cargo.toml
[package]
name = "foo"
publish = ["my-private-registry"]
[dependencies]
bar = { version = "1.0", registry-index = "ssh://git@ssh.shipyard.rs/my-private-registry/crate-index.git" }
```
`cargo publish` includes the original `Cargo.toml` file at the path `Cargo.toml.orig` in the `.crate` file (actually a `.tar.gz` archive).
Since the `registry-index` entries generated by `cargo publish` point to the specific URL of the source registry, just publishing the `.crate` file as is to the destination registry will not suffice. To resolve this problem, `publish` uses the `Cargo.toml.orig` file contained in the `.crate` file, modifies the dependency entries according to the settings of the destination registry, and publishes them to the destination registry using `cargo publish` (i.e. discard the `cargo publish`-generated `Cargo.toml`, relying instead on the modified `Cargo.toml.orig` in combination with runtime settings provided as env vars to `cargo`).
### The Global Dependency Graph of a Registry and `publish-log.csv`
Once we have solved how to take a `.crate` file from the source registry and publish it to the destination registry, there is still the issue of which order the crate versions should be published. If crate `a` version 1.2.3 depends on crate `b` version 2.3.4, then crate `b` version 2.3.4 needs to have already been published to the registry at the time crate `a` version 1.2.3 is published, otherwise it will depend on a crate that does not (yet) exist (in the destination registry, at least). If you try to publish crates without respecting this global dependency graph using `cargo publish`, it will exit with an error, and it's not a good idea otherwise, either.
Building a dependency graph for the entire registry is certainly possible, theoretically. However, in practice it is tedious to do, mainly because it requires mirroring `cargo`'s dependency resolution process, just to be able to identify the full set of dependencies that would end up in the `Cargo.lock` file. That, in turn, requires using `cargo` (i.e. via the `cargo metadata` command), which is slow for large registries (only a single `cargo metadata` command can run at a time due to the use of lock files), and quite involved in terms of parsing the programmatically-generated outputs (wow it is amazing how many different forms crate metadata is represented in various `cargo`/registry contexts!).
To shortcut these complexities, `publish` relies on the use of a Python script to extract the order in which crate versions were published to a registry using the git history of the crate index repository.
The tool (`script/get-publish-history.py`) was based on an open source script that utilizes the `GitPython` library to traverse the commit history of a repo. With a few minutes' work, we were able to modify the script to extract the publish order of all the crate versions appearing in the crate index repository. And, as much as we love Rust (and do not share the same passion for Python), porting the code to Rust using the `git2` crate appeared to be quite a tedious project itself.
To generate a `.csv` file with the order in which crates were published, first clone the crate index repository, e.g.:
```
$ git clone ssh://git@ssh.shipyard.rs/my-private-registry/crate-index.git
```
Then run the script (it has two dependencies `GitPython` and `pandas`, both of which can be `pip install`ed or otherwise acquired using whatever terrible Python package manager you want):
```
$ python script/get-publish-history.py path/to/crate-index > publish-log.csv
```
You will need a `publish-log.csv` generated from the source registry to use `publish`.
(You might be wondering why we are relying on git history to reconstruct the publishing order. The primary reason is the crate index metadata (or any other metadata universally available from a crate registry) does not include any information about when each crate version was published.)
### Detailed Usage Example
##### 1) Clone the source registry crate index repository:
```
$ mkdir source-registry
$ git clone <source registry crate index repo url> source-registry/crate-index
```
##### 2) Use `registry-backup` to download all the `.crate` files from the source registry:
```
$ cargo install registry-backup --git https://git.shipyard.rs/jstrong/registry-backup.git # or build from source
$ RUST_LOG=info registry-backup \
--index-path source-registry/crate-index \
--output-path source-registry/crate-files
```
##### 3) Use the `get-publish-history.py` script to extract the crate version publish history:
```
$ . ../virtualenvs/my-env/activate # or whatever you use
$ pip install GitPython
$ pip install pandas
$ python3 script/get-publish-history.py source-registry/crate-index > source-registry/publish-log.csv
```
##### 4) Create a configuration file:
```toml
# publish-config.toml
# source registry config
[src]
index-dir = "source-registry/crate-index" # <- see step 1
crate-files-dir = "source-registry/crate-files" # <- see step 2
publish-history-csv = "source-registry/publish-log.csv" # <- see step 3
registry-name = "my-old-registry" # <- whatever label the source registry was given in Cargo.toml files
index-url = "https://github.com/my-org/crate-index.git" # <- index url, i.e. same as one provided in ~/.cargo/config.toml
# destination registry config
[dst]
index-url = "ssh://git@ssh.shipyard.rs/my-new-registry/crate-index.git"
registry-name = "my-new-registry" # can be same as old name or a different name
auth-token = "xxx" # auth token for publishing to the destination registry
```
##### 5) Build `publish`:
```
$ cargo build --bin publish --features publish --release
```
##### 6) Validate your config file (optional):
```
$ ./target/release/publish --config-file publish-config.toml --validate
```
##### 7) Publish to the destination registry using `publish`:
```
$ RUST_LOG=info ./target/release/publish --config-file publish-config.toml
```
### Expected Runtime
As an example, using `publish`, it took us about 50 minutes to migrate a registry with 77 crates and 937 versions. Results may vary based on the machine used to run `publish` as well as the performance of the destination registry server.
### Building `publish` (Full Example)
```
$ git clone https://git.shipyard.rs/jstrong/registry-backup.git
$ cd registry-backup
$ just release-build-publish # alternately, cargo build --bin publish --features publish --release
```
Note: `--release` really is quite a bit faster, at least for larger registries.
### Configuration File
Annotated example configuration file:
```toml
{{ publish_config_sample }}
```
### Runtime Options
```console
$ ./target/release/publish --help
{{ publish_cli_menu }}
```
### Configuration File
A toml configuration file may be used instead of command line flags. A sample file (`config.toml.sample`) is included. From the example file:
```toml
{{ config_sample }}
```
## Running Tests
```console

4
doc/cli-menu.txt

@ -1,4 +1,4 @@
registry-backup 0.4.1
registry-backup 0.5.0-beta.1
Jonathan Strong <jstrong@shipyard.rs>
Download all .crate files from a registry server
@ -41,7 +41,7 @@ OPTIONS:
-U, --user-agent <USER_AGENT>
Value of user-agent HTTP header
[default: registry-backup/v0.4.1]
[default: registry-backup/v0.5.0-beta.1]
-R, --requests-per-second <INT>
Requests to registry server will not exceed this rate

2
doc/just-commands.txt

@ -2,12 +2,14 @@ Available recipes:
cargo +args='' # cargo wrapper; executes a cargo command using the settings in justfile (RUSTFLAGS, etc.)
check +args='' # cargo check wrapper
debug-build +args='' # cargo build wrapper - builds registry-backup in debug mode
debug-build-publish +args='' # cargo build wrapper - builds publish tool in debug mode
generate-readme # generate updated README.md
get-crate-version
install # cargo install registry-backup via git dep
pre-release # check, run tests, check non-error output for clippy, run rustfmt
release # release version (regenerate docs, git tag v0.0.0)
release-build +args='' # cargo build --release wrapper - builds registry-backup in release mode
release-build-publish +args='' # cargo build --release wrapper - builds publish tool in release mode
release-prep # get everything all ready for release
show-build-env # diagnostic command for viewing value of build variables at runtime
test +args='' # cargo test wrapper

19
doc/publish-cli-menu.txt

@ -0,0 +1,19 @@
registry-backup 0.5.0-beta.1
Jonathan Strong <jstrong@shipyard.rs>
USAGE:
publish [OPTIONS] --config-file <PATH>
OPTIONS:
-c, --config-file <PATH> Config file with source directories and destination registry info
--dry-run Perform all the work of generating `cargo publish` payloads, but
don't send them to the destination registry server
--validate Load config file, validate the settings, and display the final
loaded content to stdout, then exit
--filter-crates <REGEX> Use to limit which crates from the source registry are published
to the destination registry. Expects a regular expression which
will be matched against the names of crates. Only crates with
names that match the regex will be published. This field may also
be specified at the top level of the config file
-h, --help Print help information
-V, --version Print version information

10
justfile

@ -26,10 +26,20 @@ debug-build +args='':
release-build +args='':
@just cargo build --bin registry-backup --release {{args}}
# cargo build wrapper - builds publish tool in debug mode
debug-build-publish +args='':
@just cargo build --bin publish {{args}}
# cargo build --release wrapper - builds publish tool in release mode
release-build-publish +args='':
@just cargo build --bin publish --release {{args}}
# generate updated README.md
generate-readme:
just debug-build
./target/debug/registry-backup --help > doc/cli-menu.txt
just debug-build-publish
./target/debug/publish --help > doc/publish-cli-menu.txt
just --list > doc/just-commands.txt
just cargo run --bin generate-readme --features docs

23
publish-config.toml.sample

@ -0,0 +1,23 @@
# optional field for providing a regex-based filter
# to limit which crates are published to the destination
# registry. only crates with names matching the regex will
# be published.
#
filter-crates = "^."
# do everything except actually publish to the destination registry
dry-run = false
# source registry config
[src]
index-dir = "path/to/crate-index/repo" # git clone of crate index repository
crate-files-dir = "path/to/crate/files" # i.e. files downloaded by registry-backup tool
publish-history-csv = "path/to/publish-log.csv" # see docs above
registry-name = "my-old-registry" # whatever label the source registry was given in Cargo.toml files
index-url = "https://github.com/my-org/crate-index.git" # index url, i.e. same as one provided in ~/.cargo/config.toml
# destination registry config
[dst]
index-url = "ssh://git@ssh.shipyard.rs/my-new-registry/crate-index.git" # index url of new registry
registry-name = "my-new-registry" # can be same as old name or a different name
auth-token = "xxx" # auth token for publishing to the destination registry

92
script/get-publish-history.py

@ -0,0 +1,92 @@
# std
import os
import sys
import io
from pathlib import Path
import json
# non-std
import git
import pandas as pd
DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S%z"
EMPTY_TREE_SHA = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
def versions(path, branch='master'):
    """
    Walk every commit of `branch` in the git repository at `path` and yield
    one row (a dict) per index file touched by each commit.

    Each row has the keys `path`, `commit`, `author`, `time`, `crate_name`
    and `version`. `crate_name` and `version` come from the `name` and `vers`
    fields of the JSON object on the last line of the file as of that commit.

    Files that cannot be read at a commit, or whose last line is not a JSON
    object with `name` and `vers` keys, are reported on stderr and skipped.
    """

    # Create the repository, raises an error if it isn't one.
    repo = git.Repo(path)

    # Iterate through every commit for the given branch in the repository
    for commit in repo.iter_commits(branch):
        # The stats on the commit list every path changed by this commit.
        for objpath in commit.stats.files:
            p = Path(objpath)

            # Crate index files are two levels deep for 1- and 2-character
            # crate names (`1/a`, `2/ab`) and three levels deep otherwise
            # (`3/a/abc`, `ab/cd/abcd`); anything else is not an index file.
            if len(p.parts) not in (2, 3):
                print(f'skipping path: wrong depth ({p.parts})', file=sys.stderr)
                continue

            try:
                obj = commit.tree / objpath
                with io.BytesIO(obj.data_stream.read()) as f:
                    lastline = list(f.readlines())[-1].decode('utf-8')
            except Exception as e:
                print(f'failed to load file {objpath} at commit {commit}: {e}', file=sys.stderr)
                continue

            try:
                d = json.loads(lastline.strip())
                crate_name = d['name']
                version = d['vers']
            except Exception as e:
                print(f'failed to parse json at commit {commit}: {e}', file=sys.stderr)
                continue

            yield {
                'path': os.path.join(path, objpath),
                'commit': commit.hexsha,
                'author': commit.author.email,
                'time': commit.authored_datetime.strftime(DATE_TIME_FORMAT),
                'crate_name': crate_name,
                'version': version,
            }
def main(path):
    """
    Print the publish history of the crate index repository at `path` to
    stdout as CSV.

    When the same (crate_name, version) pair was touched by several commits,
    only the row with the latest commit time is kept.

    Exits with an error message if no publish history rows were found.
    """
    df = pd.DataFrame(versions(path))

    # With zero rows the frame has no columns, so indexing 'time' would raise
    # an opaque KeyError; fail with a readable message instead.
    if df.empty:
        sys.exit(f'no publish history found in index repository at {path}')

    df['time'] = pd.to_datetime(df['time'], utc=True)
    df = df.sort_values(by='time').groupby(['crate_name', 'version']).last().reset_index()
    buf = io.StringIO()
    df.to_csv(buf, index=False)
    print(buf.getvalue())
if __name__ == '__main__':
    # Show usage when no PATH argument was given or help was requested;
    # otherwise treat the first argument as the index repository path.
    help_requested = any(a == '-h' or a == '--help' for a in sys.argv)
    if len(sys.argv) != 1 and not help_requested:
        path = sys.argv[1]
        main(path)
    else:
        print("USAGE:\n python3 get-publish-history.py PATH\n", file=sys.stderr)

4
src/generate-readme.rs

@ -2,8 +2,10 @@ use chrono::prelude::*;
const README_TEMPLATE: &str = include_str!("../doc/README.tera.md");
const CLI_MENU: &str = include_str!("../doc/cli-menu.txt");
const PUBLISH_CLI_MENU: &str = include_str!("../doc/publish-cli-menu.txt");
const JUST_COMMANDS: &str = include_str!("../doc/just-commands.txt");
const CONFIG_SAMPLE: &str = include_str!("../config.toml.sample");
const PUBLISH_CONFIG_SAMPLE: &str = include_str!("../publish-config.toml.sample");
fn get_commit_ref() -> Result<String, Box<dyn std::error::Error>> {
let output = std::process::Command::new("git")
@ -21,7 +23,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
tera.add_raw_template("README.md", README_TEMPLATE).unwrap();
let mut ctx = tera::Context::new();
ctx.insert("cli_menu", CLI_MENU);
ctx.insert("publish_cli_menu", PUBLISH_CLI_MENU);
ctx.insert("config_sample", CONFIG_SAMPLE);
ctx.insert("publish_config_sample", PUBLISH_CONFIG_SAMPLE);
ctx.insert("just_commands", JUST_COMMANDS);
ctx.insert("git_commit", &get_commit_ref()?);
ctx.insert("generation_time", &Utc::now().to_rfc2822());

53
src/main.rs

@ -15,6 +15,7 @@ use governor::{Quota, RateLimiter};
use reqwest::header::AUTHORIZATION; // ACCEPT, CONTENT_TYPE};
use serde::Deserialize;
use tokio::io::AsyncBufReadExt;
use tokio::time::timeout;
use tracing::{debug, error, info, warn};
use tracing_subscriber::filter::EnvFilter;
@ -87,7 +88,7 @@ impl RegistryConfig {
out
} else {
Path::new(&self.dl)
.join(&format!(
.join(format!(
"{name}/{version}/download",
name = name,
version = version,
@ -376,12 +377,12 @@ fn setup_logger() {
builder.init();
}
async fn load_config_file(config: Config) -> Result<Config, AnyError> {
match config.config_file.as_ref() {
async fn load_config_file(opt: Config) -> Result<Config, AnyError> {
match opt.config_file.as_ref() {
Some(path) => {
debug!(?path, "loading config file");
let toml = tokio::fs::read_to_string(&path).await?;
let config: Config = match toml::from_str(&toml) {
let mut config: Config = match toml::from_str(&toml) {
Ok(c) => c,
Err(e) => panic!(
"\nfatal error: parsing config file at {} failed:\n\n{}\n\n",
@ -389,10 +390,11 @@ async fn load_config_file(config: Config) -> Result<Config, AnyError> {
e
),
};
config.dry_run |= opt.dry_run;
Ok(config)
}
None => Ok(config),
None => Ok(opt),
}
}
@ -511,7 +513,7 @@ async fn get_crate_versions(
let crate_versions: Vec<CrateVersion> = crate_versions
.into_iter()
.flat_map(|result| match result {
Ok(xs) => xs.into_iter().filter(|x| x.name != "vst").collect(),
Ok(xs) => xs.into_iter().collect(),
Err(e) => {
error!(err = ?e, "parsing metadata failed, skipping file");
vec![]
@ -549,6 +551,23 @@ async fn ensure_dir_exists<P: AsRef<std::path::Path>>(path: P) -> Result<(), Any
}
}
async fn verify_dir_exists<P: AsRef<std::path::Path>>(path: P) -> Result<(), AnyError> {
match tokio::fs::metadata(path.as_ref()).await {
Ok(meta) if meta.is_dir() => Ok(()),
Ok(meta) /* if ! meta.is_dir() */ => {
debug_assert!( ! meta.is_dir());
Err(format!("path exists, but is not a directory: {:?}", path.as_ref()).into())
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
Err(format!("path does not exist: {}", path.as_ref().display()).into())
}
Err(e) => Err(e.into()),
}
}
async fn ensure_file_parent_dir_exists<P: AsRef<std::path::Path>>(path: P) -> Result<(), AnyError> {
if let Some(parent_dir) = path.as_ref().parent() {
ensure_dir_exists(parent_dir).await
@ -665,9 +684,9 @@ async fn download_versions(
req
};
let resp = req.send().await?;
let resp = timeout(Duration::from_secs(10), req.send()).await??;
let status = resp.status();
let body = resp.bytes().await?;
let body = timeout(Duration::from_secs(10), resp.bytes()).await??;
if !status.is_success() {
error!(status = ?status, "download failed");
@ -756,7 +775,10 @@ async fn run(config: Config) -> Result<(), AnyError> {
tmp
}
(_, Some(path)) => path,
(_, Some(path)) => {
verify_dir_exists(&path).await?;
path
}
_ => unreachable!(),
};
@ -770,23 +792,28 @@ async fn run(config: Config) -> Result<(), AnyError> {
Ok(())
}
fn main() {
fn main() -> Result<(), anyhow::Error> {
let begin = Instant::now();
dotenvy::dotenv().ok();
setup_logger();
info!("initializing...");
let config = Config::parse();
info!("initializing...");
let rt = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.unwrap();
rt.block_on(run(config)).unwrap();
if let Err(err) = rt.block_on(run(config)) {
error!(?err, "something went wrong!");
anyhow::bail!("fatal error: {:?}", err);
}
info!("finished in {:?}", begin.elapsed());
Ok(())
}
#[cfg(test)]

896
src/publish.rs

@ -0,0 +1,896 @@
#![allow(unused_labels)]
use std::collections::{BTreeMap, HashMap};
use std::io::{self, prelude::*};
use std::path::{Path, PathBuf};
use std::time::*;
use anyhow::{anyhow, bail, Context, Error};
use chrono::prelude::*;
use clap::Parser;
use convert_case::{Case, Casing};
use futures::stream::StreamExt;
use rayon::prelude::*;
use semver::Version;
use serde::{Deserialize, Serialize};
use tempfile::TempDir;
use tokio::io::AsyncBufReadExt;
use tracing::{debug, error, info, trace, warn};
use tracing_subscriber::filter::EnvFilter;
// Command line options for the `publish` binary, parsed via clap's derive API.
//
// NOTE: the `///` comments on the fields are what clap shows as the help text
// for each option, so edits to them change the `--help` output. (This header
// deliberately uses `//` so it does not become the program's `about` text.)
#[derive(Parser, Debug)]
#[clap(author, version, global_setting(clap::AppSettings::DeriveDisplayOrder))]
struct Opt {
/// Config file with source directories and destination registry info
#[clap(short, long, value_name = "PATH")]
pub config_file: PathBuf,
/// Perform all the work of generating `cargo publish` payloads,
/// but don't send them to the destination registry server
#[clap(long)]
pub dry_run: bool,
/// Load config file, validate the settings, and display the final loaded content
/// to stdout, then exit
#[clap(long)]
pub validate: bool,
/// Use to limit which crates from the source registry are published to the
/// destination registry. Expects a regular expression which will be matched
/// against the names of crates. Only crates with names that match the regex
/// will be published. This field may also be specified at the top level of
/// the config file.
#[clap(long, value_name = "REGEX", alias = "filter")]
pub filter_crates: Option<String>,
/// Option to limit the number of crate versions published. If set, when the
/// number is reached the program will exit.
#[clap(long, short = 'n', value_name = "INT")]
pub limit: Option<usize>,
/// Option to skip a number of crate versions at the beginning of the dependency
/// order, for use if resuming. The skipped versions will not count as part of
/// the optional --limit.
#[clap(long, value_name = "INT")]
pub skip: Option<usize>,
}
/// Settings for the registry that crate versions are published *to*
/// (the `[dst]` / `[destination]` table of the config file).
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "kebab-case")]
struct DestinationRegistryConfig {
/// Registry index url, i.e. the url provided to Cargo via configuration
/// to identify where to pull the index metadata from.
#[serde(alias = "index")]
pub index_url: String,
/// Auth token handed to `cargo publish` (via `--token` and the registry
/// token environment variable) when publishing to the destination registry.
#[serde(alias = "token")]
pub auth_token: String,
/// The name the registry should have in the Cargo.toml files published to
/// the destination registry. This can be a rename (i.e. different than the
/// registry name provided in `SourceRegistryConfig`) or the same name.
pub registry_name: String,
}
/// Settings describing the registry that crate versions are copied *from*
/// (the `[src]` / `[source]` table of the config file).
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "kebab-case")]
struct SourceRegistryConfig {
/// Local directory holding the source registry's crate index; it is walked
/// to discover the per-crate index metadata files.
#[serde(alias = "index")]
pub index_dir: PathBuf,
/// Local directory holding the `.crate` files, looked up as
/// `{crate-files-dir}/{name}/{version}/download`.
#[serde(alias = "crate-files")]
pub crate_files_dir: PathBuf,
/// Name used in Cargo.toml for dependencies from the registry.
pub registry_name: String,
/// Path of CSV file with log of when each crate version was published.
pub publish_history_csv: PathBuf,
/// Index url of the source registry; `registry-index` values in manifests
/// equal to this url are rewritten to the destination index url.
pub index_url: String,
}
/// Top-level contents of the publish tool's TOML config file.
///
/// `dry_run` and `filter_crates` can additionally be supplied on the command
/// line; see `load_config_file` for how the two sources are combined.
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "kebab-case")]
struct Config {
/// Do everything except actually publish to the destination registry. Can also be
/// toggled using the --dry-run command line flag.
#[serde(default)]
pub dry_run: bool,
/// Local directories with source registry files
#[serde(alias = "source")]
pub src: SourceRegistryConfig,
/// Server information and authentication needed to publish to the
/// destination registry
#[serde(alias = "destination")]
pub dst: DestinationRegistryConfig,
/// Use to limit which crates from the source registry are published to the
/// destination registry. Expects a regular expression which will be matched
/// against the names of crates. Only crates with names that match the regex
/// will be published.
#[serde(default, alias = "filter")]
pub filter_crates: Option<String>,
}
impl Config {
pub fn compile_filter(&self) -> Result<Option<regex::Regex>, Error> {
match self.filter_crates.as_ref() {
Some(regex) => {
let compiled = regex::Regex::new(regex).map_err(|e| {
error!(%regex, err = ?e, "regex failed to compile: {}", e);
e
})?;
Ok(Some(compiled))
}
None => Ok(None),
}
}
}
/// One row of the publish history CSV: a single crate version together with
/// the index commit it was recorded in. Field names match the CSV column
/// headers, since rows are deserialized by header name.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
struct PublishLogRow {
/// Name of the crate.
pub crate_name: String,
/// Version of the crate that was published.
pub version: Version,
/// Path of the index file for the crate.
pub path: PathBuf,
/// Commit identifier the row was taken from.
pub commit: String,
/// Author recorded for that commit.
pub author: String,
/// Commit time; used by `main` to sort the log chronologically.
pub time: DateTime<Utc>,
// pub unix_nanos: u64,
}
/// Bundle of state needed to iterate a CSV file record by record
/// (see `csv_setup`).
struct CsvSetup {
/// Reader positioned after the header row.
pub rdr: csv::Reader<io::BufReader<std::fs::File>>,
/// Copy of the header row, used to deserialize records by column name.
pub headers: csv::ByteRecord,
/// Empty record intended to be reused as the read buffer for each row.
pub row: csv::ByteRecord,
}
/// Open the CSV file at `path` and read its header row.
///
/// Fails if `path` is missing or not a regular file, if it cannot be opened,
/// or if the header row cannot be parsed.
fn csv_setup(path: &Path) -> Result<CsvSetup, Error> {
    verify_file_exists(path)?;

    let reader = io::BufReader::new(std::fs::File::open(path)?);
    let mut rdr = csv::Reader::from_reader(reader);

    let headers = match rdr.byte_headers() {
        Ok(parsed) => parsed.clone(),
        Err(e) => return Err(anyhow!("failed to parse csv headers: {}", e)),
    };

    Ok(CsvSetup {
        rdr,
        headers,
        row: csv::ByteRecord::new(),
    })
}
/// A dependency entry in the shape used by publish metadata (as opposed to
/// the index shape, `IndexDependency`).
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
struct PublishDependency {
/// Whether the dependency is optional.
pub optional: bool,
/// Whether the dependency's default features are enabled.
pub default_features: bool,
/// Name of the dependency.
pub name: String,
/// Features enabled for the dependency.
pub features: Vec<String>,
// cargo and crates-io have this as string
#[serde(alias = "req")]
pub version_req: semver::VersionReq,
/// Target the dependency applies to, if any.
pub target: Option<String>,
// crates-io has this as option
pub kind: DependencyKind,
/// Registry the dependency comes from; omitted from output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub registry: Option<String>,
/// Name the dependency was given in Cargo.toml when renamed; omitted from
/// output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub explicit_name_in_toml: Option<String>,
}
impl From<IndexDependency> for PublishDependency {
fn from(dep: IndexDependency) -> Self {
Self {
name: dep.name,
features: dep.features,
default_features: dep.default_features,
optional: dep.optional,
target: dep.target,
kind: dep.kind,
registry: dep.registry,
version_req: dep.req,
explicit_name_in_toml: dep.package,
}
}
}
/// One line of a crate's index metadata file, i.e. the metadata of a single
/// published version of a crate.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
struct IndexMeta {
// same everything as publish metadata
pub name: String,
#[serde(alias = "version")]
pub vers: semver::Version,
// NOTE(review): the `dependencies` alias is attached to `features`, not to
// `deps` below - confirm this is intended.
#[serde(alias = "dependencies")]
pub features: BTreeMap<String, Vec<String>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub links: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub badges: Option<BTreeMap<String, String>>,
// modified format/field names
pub deps: Vec<IndexDependency>,
// fields that don't appear in publish metadata
pub cksum: String,
pub yanked: bool,
// ancient fields, these were actually written
// in Sanskrit on stone tablets
#[serde(skip_serializing_if = "Option::is_none")]
pub features2: Option<BTreeMap<String, Vec<String>>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub v: Option<u8>,
}
/// A dependency entry in the shape used by the crate index (as opposed to
/// the publish-metadata shape, `PublishDependency`).
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
struct IndexDependency {
/// corresponds to `explicit_name_in_toml` field in `publish::Dependency`
/// when a dep is renamed in Cargo.toml, otherwise same as `package`.
pub name: String,
/// corresponds to `name` in `publish::Dependency`
#[serde(skip_serializing_if = "Option::is_none")]
pub package: Option<String>,
/// in publish meta, this field is called `version_req`, and the index
/// format requires it to be renamed to `req`
#[serde(alias = "version_req")]
pub req: semver::VersionReq,
/// Features enabled for the dependency.
pub features: Vec<String>,
/// Whether the dependency is optional.
pub optional: bool,
/// Whether the dependency's default features are enabled.
pub default_features: bool,
/// Target the dependency applies to, if any.
pub target: Option<String>,
/// Section in which the dependency was defined.
pub kind: DependencyKind,
/// Registry the dependency comes from; omitted from output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub registry: Option<String>,
}
/// Section in which this dependency was defined
///
/// (De)serialized as the lowercase variant name (`normal`, `build`, `dev`).
#[derive(Copy, Clone, Serialize, Deserialize, Debug, PartialEq, PartialOrd, Ord, Eq, Hash)]
#[serde(rename_all = "lowercase")]
pub enum DependencyKind {
/// Used at run time
Normal,
/// Used at build time, not available at run time
Build,
/// Not fetched and not used, except for when used directly in a workspace
Dev,
}
/// Pull the package's manifest files out of an (already decompressed)
/// `.crate` tar stream.
///
/// Only entries located directly inside the archive's top-level package
/// directory (e.g. `{name}-{version}/Cargo.toml`) are considered, so a nested
/// manifest such as `{name}-{version}/examples/foo/Cargo.toml` can never
/// replace the package's own.
///
/// # Errors
///
/// Fails if the archive cannot be read, if a matching entry is not valid
/// UTF-8, or if either `Cargo.toml` or `Cargo.toml.orig` is missing.
/// `Cargo.lock` is optional.
fn extract_manifest_files_from_tar<R: Read>(rdr: R) -> Result<ManifestFiles, Error> {
    let mut archive = tar::Archive::new(rdr);

    let mut cargo_toml = None;
    let mut cargo_toml_orig = None;
    let mut cargo_lock = None;

    for entry in archive.entries()? {
        let mut entry = entry?;

        // Decide which slot (if any) this entry belongs to before reading it.
        let slot = {
            let path = entry.path()?;
            // count only real path segments, so a leading `./` is ignored
            let depth = path
                .components()
                .filter(|c| matches!(c, std::path::Component::Normal(_)))
                .count();
            if depth != 2 {
                continue;
            }
            match path.file_name().and_then(|f| f.to_str()) {
                Some("Cargo.toml.orig") => &mut cargo_toml_orig,
                Some("Cargo.toml") => &mut cargo_toml,
                Some("Cargo.lock") => &mut cargo_lock,
                _ => continue,
            }
        };

        let mut data = String::new();
        entry.read_to_string(&mut data)?;
        *slot = Some(data);

        // nothing left to look for; skip the rest of the archive
        if cargo_toml.is_some() && cargo_toml_orig.is_some() && cargo_lock.is_some() {
            break;
        }
    }

    match (cargo_toml, cargo_toml_orig) {
        (Some(cargo_toml), Some(cargo_toml_orig)) => Ok(ManifestFiles {
            cargo_toml,
            cargo_toml_orig,
            cargo_lock,
        }),
        (cargo_toml, cargo_toml_orig) => anyhow::bail!(
            "some required manifest files missing in .crate archive \
            (cargo_toml={:?} cargo_toml_orig={:?} cargo_lock={:?})",
            cargo_toml.is_some(),
            cargo_toml_orig.is_some(),
            cargo_lock.is_some(),
        ),
    }
}
/// Install the global `tracing` subscriber: formatted output with ANSI
/// colors enabled, filtered by the default `EnvFilter` environment variable.
fn setup_logger() {
    tracing_subscriber::fmt()
        .with_env_filter(EnvFilter::from_default_env())
        .with_ansi(true)
        .init();
}
fn load_config_file(opt: &Opt) -> Result<Config, Error> {
if !opt.config_file.exists() {
bail!("path does not exist: {:?}", opt.config_file);
}
let toml = std::fs::read_to_string(&opt.config_file)?;
let mut config: Config = toml::from_str(&toml).context(
"read config file, but unable to parse toml - check \
format against example config",
)?;
// augment using command line opts
config.filter_crates = config.filter_crates.or_else(|| opt.filter_crates.clone());
config.dry_run |= opt.dry_run;
Ok(config)
}
/// Whether the entry's file name starts with a `.`; names that are not
/// valid UTF-8 are treated as not hidden.
fn is_hidden(entry: &walkdir::DirEntry) -> bool {
    match entry.file_name().to_str() {
        Some(name) => name.starts_with('.'),
        None => false,
    }
}
/// Discover and parse the index metadata files under `config.src.index_dir`.
///
/// Returns a map from crate name (taken from the metadata file's name) to
/// the `IndexMeta` entries parsed from that file, one per line. Crates whose
/// name does not match the optional `filter_crates` regex are left out, as
/// are files that fail to open or parse (those are logged and skipped).
///
/// Errors only if the filter regex fails to compile.
async fn get_index_metas(config: &Config) -> Result<HashMap<String, Vec<IndexMeta>>, Error> {
let filter = config.compile_filter()?;
// number of files excluded by the regex filter
let mut n_excl = 0;
// Walk the index dir, skipping hidden entries, and keep regular files found
// two or three levels below the root as (crate name, path) pairs.
let files: Vec<(String, PathBuf)> = walkdir::WalkDir::new(&config.src.index_dir)
.max_depth(3)
.into_iter()
.filter_entry(|e| !is_hidden(e))
.filter_map(|res| match res {
Ok(entry) => {
if entry.file_type().is_file() && entry.depth() >= 2 && entry.depth() <= 3 {
let path = entry.into_path();
// the file name doubles as the crate name ("" if it is not valid UTF-8)
let crate_name: &str = path.file_name().and_then(|x| x.to_str()).unwrap_or("");
if let Some(filter) = filter.as_ref() {
if !filter.is_match(crate_name.as_ref()) {
trace!(%crate_name, "crate excluded by filter");
n_excl += 1;
return None;
}
}
debug!(?path, "found crate index metadata file to parse");
Some((crate_name.to_owned(), path))
} else {
None
}
}
Err(e) => {
warn!(error = ?e, "walkdir result is error");
None
}
})
.collect();
let n_files = files.len();
info!("found {} crate index metadata files to parse", n_files);
if n_excl > 0 {
warn!(
regex = %config.filter_crates.as_deref().unwrap_or(""),
n_files,
n_excl,
"regex filter (--filter-crates) excluded {} crates", n_excl,
);
}
// Parse the files concurrently (up to one in flight per CPU). Each file is
// read line by line, one JSON-encoded `IndexMeta` per line; the first line
// that fails to parse fails the whole file.
let crate_versions: Vec<Result<(String, Vec<IndexMeta>), Error>> =
futures::stream::iter(files.into_iter().map(|(crate_name, path)| async move {
let file = tokio::fs::File::open(&path).await.map_err(|e| {
error!(err = ?e, ?path, "failed to open file");
e
})?;
let buf = tokio::io::BufReader::new(file);
let mut out = Vec::new();
let mut lines = buf.lines();
'lines: while let Some(line) = lines.next_line().await? {
let index_meta: IndexMeta = serde_json::from_str(&line).map_err(|e| {
error!(err = ?e, ?path, "failed to parse line");
e
})?;
out.push(index_meta);
}
debug!(crate_name = %out.first().map(|x| x.name.as_str()).unwrap_or("na"),
"parsed {} crate versions from metadata file", out.len()
);
Ok((crate_name, out))
}))
.buffer_unordered(num_cpus::get())
.collect()
.await;
let mut total_number_of_crate_versions = 0;
// map of crate-name => [IndexMeta] (one per published version)
// files that failed above are logged and dropped here rather than failing the call
let crate_versions: HashMap<String, Vec<IndexMeta>> = crate_versions
.into_iter()
.filter_map(|result| match result {
Ok((crate_name, xs)) => {
total_number_of_crate_versions += xs.len();
Some((crate_name, xs))
}
Err(e) => {
error!(err = ?e, "parsing metadata failed, skipping file");
None
}
})
.collect();
info!(
n_files,
n_excl,
n_crates = crate_versions.len(),
total_number_of_crate_versions,
"parsed {} crate version metadata entries from index",
total_number_of_crate_versions,
);
Ok(crate_versions)
}
/// Warning lists carried inside a `PublishResponse`. Every list defaults to
/// empty when its key is absent from the deserialized input.
#[derive(Debug, Clone, Deserialize, Eq, PartialEq, Default)]
struct PublishWarnings {
#[serde(default)]
pub invalid_categories: Vec<String>,
#[serde(default)]
pub invalid_badges: Vec<String>,
#[serde(default)]
pub other: Vec<String>,
}
/// Deserialized response to a publish request; `warnings` defaults to empty
/// lists when the key is absent.
#[derive(Debug, Clone, Deserialize, Eq, PartialEq, Default)]
struct PublishResponse {
#[serde(default)]
pub warnings: PublishWarnings,
}
/// Text of the manifest-related files extracted from a `.crate` archive
/// (see `extract_manifest_files_from_tar`).
struct ManifestFiles {
/// Contents of `Cargo.toml` as found in the archive.
#[allow(dead_code)]
cargo_toml: String,
/// Contents of `Cargo.toml.orig`; this is the manifest that gets edited
/// and written back out before publishing.
cargo_toml_orig: String,
/// Contents of `Cargo.lock`, if the archive contained one.
#[allow(dead_code)]
cargo_lock: Option<String>,
}
/// Everything gathered about a single crate version while preparing it for
/// publishing.
struct VersionMeta {
/// Index metadata entry for this version.
index_meta: IndexMeta,
/// Manifest files extracted from the version's `.crate` archive.
manifest_files: ManifestFiles,
/// Temp dir the `.crate` archive was unpacked into.
tmp: TempDir,
/// Edited manifest text; `None` until `prepare_source_dir_for_publish`
/// has written the modified `Cargo.toml`.
modified_manifest_toml: Option<String>,
}
impl VersionMeta {
pub fn source_dir(&self) -> PathBuf {
self.tmp
.path()
.join(format!("{}-{}", self.index_meta.name, self.index_meta.vers))
}
}
/// A (crate name, version) pair that can be hashed and compared.
// NOTE(review): not used in the visible part of the file; presumably a graph
// node for dependency ordering - confirm against the rest of the file.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct Node<'a> {
name: &'a str,
vers: Version,
}
/// Load one crate version from disk and unpack it for later editing.
///
/// Reads the `.crate` file at `{crate_files_dir}/{crate_name}/{vers}/download`,
/// extracts its manifest files, unpacks the whole archive into a fresh temp
/// dir and creates an empty `target` dir inside that temp dir.
///
/// Errors if the `.crate` file is missing or unreadable, if required manifest
/// files are absent from the archive, or if unpacking fails.
fn parse_one_manifest(
config: &Config,
crate_name: &str,
index_meta: IndexMeta,
) -> Result<VersionMeta, Error> {
let version = index_meta.vers.clone();
trace!(%crate_name, %version, "processing crate version");
let dot_crate_path = config
.src
.crate_files_dir
.join(format!("{}/{}/download", crate_name, index_meta.vers));
verify_file_exists(&dot_crate_path)?;
trace!(path = ?dot_crate_path, "reading .crate file");
let dot_crate_bytes = std::fs::read(&dot_crate_path).with_context(|| {
format!(
"failed to read .crate file for \
{crate_name} v{0} with path {dot_crate_path:?}",
index_meta.vers,
)
})?;
trace!("extracting Cargo.toml from .crate targz archive");
// first pass over the gzip stream: pull out just the manifest files
let decoder = flate2::read::GzDecoder::new(&dot_crate_bytes[..]);
let manifest_files = extract_manifest_files_from_tar(decoder).map_err(|err| {
error!(%crate_name, vers = %index_meta.vers, ?err, "failed to extract manifest files");
err
})?;
let tmp = TempDir::new()?;
// second pass: unpack the full archive into the temp dir
let decoder = flate2::read::GzDecoder::new(&dot_crate_bytes[..]);
tar::Archive::new(decoder)
.unpack(tmp.path())
.map_err(|err| {
error!(%crate_name, vers = %index_meta.vers, ?err, "failed to unpack to temp dir");
err
})?;
trace!(tmpdir = ?tmp.path(), "unpacked .crate archive to temp dir");
let target_dir = tmp.path().join("target");
std::fs::create_dir(target_dir)?;
Ok(VersionMeta {
index_meta,
manifest_files,
tmp,
modified_manifest_toml: None,
})
}
/// Run `parse_one_manifest` for every version of every crate, processing
/// crates in parallel via rayon.
///
/// A version that fails to parse is logged and skipped; a crate for which no
/// version parsed successfully is left out of the returned map. As written,
/// this function itself never returns `Err`.
fn parse_manifests(
config: &Config,
crate_versions: HashMap<String, Vec<IndexMeta>>,
) -> Result<HashMap<String, Vec<VersionMeta>>, Error> {
let begin = Instant::now();
let out: HashMap<String, Vec<VersionMeta>> = crate_versions
.into_par_iter()
.filter_map(|(crate_name, versions)| -> Option<(String, Vec<VersionMeta>)> {
let begin = Instant::now();
debug!(%crate_name, "parsing manifests");
let mut version_metas = Vec::new();
// count of versions of this crate that failed to parse
let mut n_err = 0;
for index_meta in versions {
let version = index_meta.vers.clone();
match parse_one_manifest(config, &crate_name, index_meta) {
Ok(meta) => {
version_metas.push(meta);
}
Err(err) => {
error!(?err, %crate_name, %version, "failed to parse manifest; skipping");
n_err += 1;
}
}
}
debug!(%crate_name, n_err, "parsed {} manifests in {:?}", version_metas.len(), begin.elapsed());
if version_metas.is_empty() {
warn!(%crate_name, n_err, "parsed zero manifests successfully for crate!");
None
} else {
Some((crate_name, version_metas))
}
}).collect();
info!("parsed crate version manifests in {:?}", begin.elapsed());
Ok(out)
}
/// edit registry deps to point to the destination registry.
///
/// Every table-like item in the manifest (tables, inline tables, and tables
/// inside arrays) is inspected: a `registry` value equal to the source
/// registry name is replaced by the destination registry name, and a
/// `registry-index` value equal to the source index url is replaced by the
/// destination index url.
///
/// NOTE: recursive traversing of the toml is needed to handle things
/// like conditional deps blocks like:
///
/// ```toml,ignore
/// [target.'cfg(not(target_env = "msvc"))'.dependencies]
/// dep-one = { version = "0.1.0", registry = "old-registry" }
/// ```
fn edit_deps(manifest: &mut toml_edit::Document, config: &Config) {
    use toml_edit::{visit_mut::VisitMut, TableLike};

    struct DepsVisitor<'a>(&'a Config);

    impl<'a> VisitMut for DepsVisitor<'a> {
        fn visit_table_like_mut(&mut self, dep: &mut dyn TableLike) {
            let config = self.0;
            let mut edited = false;

            if let Some(registry_item) = dep.get_mut("registry") {
                if registry_item.as_str() == Some(config.src.registry_name.as_str()) {
                    *registry_item = toml_edit::value(config.dst.registry_name.as_str());
                    edited = true;
                }
            }

            if let Some(registry_index_item) = dep.get_mut("registry-index") {
                if registry_index_item.as_str() == Some(config.src.index_url.as_str()) {
                    *registry_index_item = toml_edit::value(config.dst.index_url.as_str());
                    edited = true;
                }
            }

            // A table that was just edited is a dependency spec; nothing
            // further to look for inside it.
            if !edited {
                for (_, item) in dep.iter_mut() {
                    // Dispatch through the visitor itself so that this
                    // override runs for *every* nested table-like item.
                    // Going through the free `visit_table_like_mut` function
                    // would bypass the override for the direct child and only
                    // apply it one level further down, i.e. inspect only
                    // every other nesting level.
                    self.visit_item_mut(item);
                }
            }
        }
    }

    let mut visitor = DepsVisitor(config);
    visitor.visit_document_mut(&mut *manifest);
}
fn edit_publish_registry(
manifest: &mut toml_edit::Document,
src_registry_name: &str,
dst_registry_name: &str,
) -> Result<(), Error> {
let Some(package) = manifest
.get_mut("package")
.and_then(|item| item.as_table_like_mut())
else {
anyhow::bail!("package key not found in manifest toml");
};
let Some(publish_item) = package.get_mut("publish") else {
trace!("no 'publish' key in Cargo.toml package section");
return Ok(());
};
let Some(publish_array) = publish_item.as_array_mut() else {
anyhow::bail!("failed to cast publish item as array");
};
let Some(i) = publish_array
.iter()
.position(|x| x.as_str().map(|s| s == src_registry_name).unwrap_or(false))
else {
anyhow::bail!(
"publish key exists, but source registry name does not appear in it! (`{}`)",
publish_array.to_string()
);
};
let item_i = publish_array.get_mut(i).unwrap();
*item_i = toml_edit::Value::from(dst_registry_name);
Ok(())
}
/// Rewrite the unpacked source dir of one crate version so it can be
/// published to the destination registry.
///
/// Parses the original manifest (`Cargo.toml.orig` contents), points registry
/// deps and the `publish` list at the destination registry, writes the result
/// over `Cargo.toml`, records the edited text in `meta.modified_manifest_toml`,
/// and deletes `Cargo.toml.orig` and `Cargo.lock` from the source dir if they
/// exist.
///
/// Errors if the manifest fails to parse, if `edit_publish_registry` fails,
/// or on any file system error.
fn prepare_source_dir_for_publish(config: &Config, meta: &mut VersionMeta) -> Result<(), Error> {
let source_dir = meta.source_dir();
let mut modified_manifest = meta
.manifest_files
.cargo_toml_orig
.parse::<toml_edit::Document>()?;
edit_deps(&mut modified_manifest, config);
edit_publish_registry(
&mut modified_manifest,
&config.src.registry_name,
&config.dst.registry_name,
)?;
// write modified manifest over Cargo.toml (leaves Cargo.toml.orig as is)
let modified_manifest_toml = modified_manifest.to_string();
let cargo_toml_path = source_dir.join("Cargo.toml");
std::fs::write(&cargo_toml_path, modified_manifest_toml.as_bytes())?;
debug!(
crate_name = %meta.index_meta.name,
vers = %meta.index_meta.vers,
path = ?cargo_toml_path,
"wrote modified manifest file",
);
// keep the edited text around so it can be shown if publishing fails
meta.modified_manifest_toml = Some(modified_manifest_toml);
let cargo_toml_orig_path = source_dir.join("Cargo.toml.orig");
if cargo_toml_orig_path.exists() {
std::fs::remove_file(&cargo_toml_orig_path)?;
trace!(
crate_name = %meta.index_meta.name,
vers = %meta.index_meta.vers,
path = ?cargo_toml_orig_path,
"removed Cargo.toml.orig file",
);
}
let cargo_lock_path = source_dir.join("Cargo.lock");
if cargo_lock_path.exists() {
std::fs::remove_file(&cargo_lock_path)?;
trace!(
crate_name = %meta.index_meta.name,
vers = %meta.index_meta.vers,
path = ?cargo_lock_path,
"removed Cargo.lock file",
);
}
Ok(())
}
fn prepare_source_dirs_for_publish(
config: &Config,
manifests: &mut HashMap<String, Vec<VersionMeta>>,
) -> Result<(), Error> {
let begin = Instant::now();
manifests.par_iter_mut()
.map(|(name, versions)| -> Result<(), Error> {
for meta in versions.iter_mut() {
prepare_source_dir_for_publish(config, meta)
.map_err(|err| {
error!(%name, vers = %meta.index_meta.vers, ?err, "prepare_source_dir_for_publish failed");
err
})?;
}
Ok(())
}).collect::<Result<Vec<()>, Error>>()?;
info!("modified Cargo.toml manifests in {:?}", begin.elapsed());
Ok(())
}
/// Publish one prepared crate version by running `cargo publish` against the
/// modified source dir.
///
/// The destination index url and auth token are supplied to cargo through
/// `CARGO_REGISTRIES_{NAME}_INDEX` / `CARGO_REGISTRIES_{NAME}_TOKEN`
/// environment variables (and the token additionally via `--token`).
///
/// In `--dry-run` mode the command is only logged, never executed.
///
/// A failing `cargo publish` is logged (with both manifests, unless the
/// failure mentions "already exists") but is *not* returned as an error, so
/// the caller can continue with other crate versions. `Err` is returned only
/// if the `cargo` process cannot be run at all.
///
/// The auth token is never written to the logs; it appears as `<redacted>`
/// in every logged command line.
fn cargo_publish_modified_source_dir(config: &Config, meta: &VersionMeta) -> Result<(), Error> {
    let begin = Instant::now();
    let name = &meta.index_meta.name;
    let vers = &meta.index_meta.vers;
    debug!(%name, %vers, "publishing crate version");

    // NOTE(review): cargo derives these variable names by upper-casing the
    // registry name and replacing `-` with `_`; `Case::ScreamingSnake` may
    // insert extra underscores for names with digits or mixed case - confirm.
    let registry_env_name = config.dst.registry_name.to_case(Case::ScreamingSnake);
    let index_env_key = format!("CARGO_REGISTRIES_{}_INDEX", registry_env_name);
    let token_env_key = format!("CARGO_REGISTRIES_{}_TOKEN", registry_env_name);

    let source_dir = meta.source_dir();
    let manifest_path = source_dir.join("Cargo.toml");
    let manifest_path_str = manifest_path.display().to_string();

    let mut args: Vec<&str> = vec!["publish"];
    args.extend_from_slice(&["--registry", &config.dst.registry_name][..]);
    args.extend_from_slice(&["--token", &config.dst.auth_token][..]);
    args.extend_from_slice(&["--manifest-path", manifest_path_str.as_str()][..]);
    args.extend_from_slice(&["--no-verify", "--allow-dirty", "-vv"][..]);

    // Command line as shown in logs: identical to `args`, except that the
    // value following `--token` is hidden.
    let mut redact_next = false;
    let logged_args: Vec<&str> = args
        .iter()
        .map(|&arg| {
            let shown = if redact_next { "<redacted>" } else { arg };
            redact_next = arg == "--token";
            shown
        })
        .collect();
    let logged_cmd = format!("cargo {}", logged_args.join(" "));

    if config.dry_run {
        warn!(
            %name,
            %vers,
            cmd = logged_cmd.as_str(),
            "skipping actual publish cmd: --dry-run mode",
        );
        return Ok(());
    }

    debug!(%name, %vers, "executing `{}`", logged_cmd);

    let output = std::process::Command::new("cargo")
        .env(&index_env_key, &config.dst.index_url)
        .env(&token_env_key, &config.dst.auth_token)
        .args(&args)
        .output()?;

    debug!(%name, %vers, exit_status = ?output.status, "finished executing `cargo publish` command");

    if !output.status.success() {
        // lossy decoding keeps whatever is readable instead of dropping the
        // whole output when it contains invalid UTF-8
        let stdout = String::from_utf8_lossy(&output.stdout);
        let stderr = String::from_utf8_lossy(&output.stderr);
        error!(exit_status = ?output.status, "cargo publish error!\nstdout:\n{}\nstderr:\n:{}\n\n", stdout, stderr);

        // "already exists" failures are expected when resuming; the
        // manifests are only worth dumping for other kinds of failures
        if !stderr.contains("already exists") {
            debug!(
                "cargo publish error - original Cargo.toml:\n***\n{}\n***",
                meta.manifest_files.cargo_toml_orig
            );
            debug!(
                "cargo publish error - modified Cargo.toml:\n***\n{}\n***",
                meta.modified_manifest_toml
                    .as_deref()
                    .unwrap_or("<modified manifest not available>")
            );
        }
    }

    info!(%name, %vers, "finished cargo publish in {:?}", begin.elapsed());
    Ok(())
}
async fn verify_dir_exists<P: AsRef<std::path::Path>>(path: P) -> Result<(), Error> {
match tokio::fs::metadata(path.as_ref()).await {
Ok(meta) if meta.is_dir() => Ok(()),
Ok(meta) /* if ! meta.is_dir() */ => {
debug_assert!( ! meta.is_dir());
bail!("path exists, but is not a directory: {:?}", path.as_ref())
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
bail!("path does not exist: {}", path.as_ref().display());
}
Err(e) => Err(e.into()),
}
}
/// Checks that `path` exists on disk and refers to a regular file.
///
/// # Errors
///
/// Fails when the path is missing, when it exists but is not a file, or
/// when its metadata cannot be read for any other reason (the I/O error is
/// converted into `Error`).
fn verify_file_exists<P: AsRef<std::path::Path>>(path: P) -> Result<(), Error> {
    let target = path.as_ref();

    match std::fs::metadata(target) {
        Ok(meta) => {
            if meta.is_file() {
                Ok(())
            } else {
                bail!("path exists, but is not a file: {:?}", target)
            }
        }
        Err(err) => match err.kind() {
            std::io::ErrorKind::NotFound => {
                bail!("path does not exist: {}", target.display())
            }
            // any other I/O failure is passed through unchanged
            _ => Err(err.into()),
        },
    }
}
/// Reads the csv file at `path` and deserializes every record into a
/// `PublishLogRow`, returning the rows in file order.
///
/// # Errors
///
/// Fails if the csv reader cannot be set up, if a record cannot be read, or
/// if a record cannot be deserialized; in the last case the offending row
/// and the headers are logged before the error is returned.
fn read_publish_log_csv(path: &Path) -> Result<Vec<PublishLogRow>, Error> {
    let started = Instant::now();
    let CsvSetup {
        mut rdr,
        headers,
        mut row,
    } = csv_setup(path)?;

    let mut rows = Vec::new();
    loop {
        // `read_byte_record` yields `false` once there is nothing left to read
        if !rdr.read_byte_record(&mut row)? {
            break;
        }

        // only partially deserialized after this
        let parsed: Result<PublishLogRow, _> = row.deserialize(Some(&headers));
        match parsed {
            Ok(entry) => rows.push(entry),
            Err(err) => {
                error!(?row, ?headers, ?err, "deserializing row failed");
                return Err(err.into());
            }
        }
    }

    info!(?path, "parsed publish log csv in {:?}", started.elapsed());
    Ok(rows)
}
fn main() -> Result<(), Error> {
let begin = Instant::now();
dotenvy::dotenv().ok();
let opt = Opt::parse();
setup_logger();
let config = load_config_file(&opt)?;
let rt = tokio::runtime::Runtime::new()?;
rt.block_on(verify_dir_exists(&config.src.index_dir))?;
rt.block_on(verify_dir_exists(&config.src.crate_files_dir))?;
verify_file_exists(&config.src.publish_history_csv)?;
if opt.validate {
println!("{:#?}", config);
return Ok(());
}
let mut publish_log = read_publish_log_csv(&config.src.publish_history_csv)?;
publish_log.sort_by_key(|x| x.time);
assert!(!publish_log.is_empty());
info!(n_rows = publish_log.len(), "parsed publish log csv");
if let Some(filter) = config.compile_filter()? {
publish_log.retain(|x| filter.is_match(&x.crate_name));
info!(n_filtered_rows = publish_log.len(), "filtered publish log");
}
let krates = rt.block_on(get_index_metas(&config))?;
let mut manifests = parse_manifests(&config, krates)?;
prepare_source_dirs_for_publish(&config, &mut manifests)?;
let mut by_name_vers: HashMap<(&str, &Version), &VersionMeta> = manifests
.iter()
.flat_map(|(k, v)| v.iter().map(|m| ((k.as_str(), &m.index_meta.vers), m)))
.collect();
let skip = opt.skip.unwrap_or(0);
let limit = opt.limit.unwrap_or(publish_log.len());
debug!(skip, limit, n_total_versions = publish_log.len(), "ready to publish!");
for row in publish_log.iter().skip(skip).take(limit) {
let Some(meta) = by_name_vers.remove(&(row.crate_name.as_str(), &row.version)) else {
warn!(
?row,
"crate version in publish log not found in index versions"
);
continue;
};
if let Err(err) = cargo_publish_modified_source_dir(&config, meta) {
error!(?err, name = %meta.index_meta.name, vers = %meta.index_meta.vers, "failed to publish crate version");
}
}
info!("finished publishing crates to destination registry");
drop(manifests);
drop(rt);
info!("finished in {:?}", begin.elapsed());
Ok(())
}
Loading…
Cancel
Save