registry-backup/script/get-publish-history.py

## Imports
import os
import sys
import git
import io
from pathlib import Path
import json
import pandas as pd

## Module Constants
# strftime pattern used to render commit timestamps: date and time joined by
# 'T', followed by the numeric UTC offset (%z).
DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S%z"
# SHA of git's empty tree; serves as a diff base for a commit that has no
# parent (i.e. the first commit of a repository).
EMPTY_TREE_SHA   = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"


def _is_index_entry(parts):
    """
    Return True when the path components look like a crate file in the index.

    Most crate files sit three levels deep (``ab/cd/abcd``, ``3/a/abc``).
    NOTE(review): per the cargo registry index layout, crates with 1- and
    2-character names live only two levels deep under the literal directories
    ``1`` and ``2``; those are accepted here as well so they are not dropped.
    """
    if len(parts) == 3:
        return True
    return len(parts) == 2 and parts[0] in ('1', '2')


def versions(path, branch='master'):
    """
    Yield one record per index file touched by each commit of a repository.

    For every commit on ``branch`` and every file the commit changed, the
    file is read as it exists in that commit, its last line is parsed as
    JSON, and a row describing that entry is yielded. Files that are not at
    crate-file depth, cannot be read at that commit (for example because the
    commit deleted them), or whose last line is not a JSON object carrying
    ``name`` and ``vers`` are reported on stderr and skipped.

    Parameters
    ----------
    path : str
        Location of the git repository on disk.
    branch : str
        Branch (or any revision) whose history is walked.

    Yields
    ------
    dict
        Keys ``path`` (``path`` joined with the file's path in the repo),
        ``commit`` (hex SHA), ``author`` (author email), ``time`` (authored
        timestamp formatted with ``DATE_TIME_FORMAT``), ``crate`` and
        ``vers`` (the ``name`` and ``vers`` fields of the parsed JSON).

    Raises
    ------
    Whatever ``git.Repo`` raises when ``path`` is not a git repository.
    """

    # Create the repository, raises an error if it isn't one.
    repo = git.Repo(path)

    # Iterate through every commit for the given branch in the repository
    for commit in repo.iter_commits(branch):
        # The stats on the commit summarise the files it changed; only the
        # paths are needed. (A per-commit diff used to be computed here as
        # well, but its result was never used and its rename lookup compared
        # against the repository path instead of the file path, so it has
        # been dropped. diff_size/diff_type remain available if per-file
        # diff details are wanted again.)
        for objpath in commit.stats.files:
            parts = Path(objpath).parts
            if not _is_index_entry(parts):
                print(f'skipping path: wrong depth ({parts})', file=sys.stderr)
                continue

            # Read the file as of this commit and keep only its last line.
            # Lookup failures (file removed by the commit), empty files and
            # undecodable bytes all land in the same best-effort skip.
            try:
                blob = commit.tree / objpath
                lines = blob.data_stream.read().splitlines()
                lastline = lines[-1].decode('utf-8').strip()
            except Exception:
                print(f'failed to load file at commit {commit}', file=sys.stderr)
                continue

            try:
                d = json.loads(lastline)
            except ValueError as e:
                print(f'failed to parse json at commit {commit}: {e}', file=sys.stderr)
                continue

            # Valid JSON is not necessarily an object with the expected
            # keys; skip such entries instead of aborting the whole walk.
            try:
                crate, vers = d['name'], d['vers']
            except (KeyError, TypeError) as e:
                print(f'failed to read name/vers at commit {commit}: {e!r}', file=sys.stderr)
                continue

            yield {
                'path': os.path.join(path, objpath),
                'commit': commit.hexsha,
                'author': commit.author.email,
                'time': commit.authored_datetime.strftime(DATE_TIME_FORMAT),
                'crate': crate,
                'vers': vers,
            }


def diff_size(diff):
    """
    Return a signed size for a diff entry, derived from its two blobs.

    * deleted file with no ``b_blob``: minus the size of ``a_blob``
    * new file with no ``a_blob``: the size of ``b_blob``
    * anything else: size of ``a_blob`` minus size of ``b_blob``
    """
    side_a = diff.a_blob
    side_b = diff.b_blob

    # Deletion: only the a-side exists, report it as a negative size.
    if side_b is None and diff.deleted_file:
        return -side_a.size

    # Addition: only the b-side exists, report its full size.
    if side_a is None and diff.new_file:
        return side_b.size

    # Both sides present: difference of the two blob sizes.
    return side_a.size - side_b.size


def diff_type(diff):
    """
    Map a diff's flags to a one-letter change code.

    Flags are checked in priority order: renamed ('R'), deleted ('D'),
    new file ('A'). If none is set the change is a modification ('M').
    """
    flag_codes = (
        ('renamed', 'R'),
        ('deleted_file', 'D'),
        ('new_file', 'A'),
    )
    for flag, code in flag_codes:
        if getattr(diff, flag):
            return code
    return 'M'

# Collect one row per (commit, index file) from the local 'crate-index' repo.
df = pd.DataFrame(versions('crate-index'))

# Parse the formatted timestamps. utc=True normalises every value to UTC so
# that commits authored under different UTC offsets still end up in a single
# tz-aware datetime column that can be sorted and cast to integers.
df['time'] = pd.to_datetime(df['time'], utc=True)

# Integer timestamp for each row. 'int64' is spelled out because plain 'int'
# is platform-dependent and can resolve to a 32-bit type, which cannot hold
# these values.
df['unix_nanos'] = df['time'].astype('int64')

# For each (crate, vers) pair keep the values from its latest row by time.
df = df.sort_values(by='time').groupby(['crate', 'vers']).last().reset_index()

# Emit the result as CSV on stdout.
buf = io.StringIO()
df.to_csv(buf, index=False)
print(buf.getvalue())
Easier solution (?): a Python script to extract which crates were published, and in which order, from the index repo 1 year ago:
			`## Imports`
			`import os`
			`import sys`
			`import git`
			`import io`
			`from pathlib import Path`
			`import json`
			`import pandas as pd`

			`## Module Constants`
			`DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S%z"`
			`EMPTY_TREE_SHA = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"`


			`def versions(path, branch='master'):`
			`"""`
			`This function returns a generator which iterates through all commits of`
			`the repository located in the given path for the given branch. It yields`
			`file diff information to show a timeseries of file changes.`
			`"""`

			`# Create the repository, raises an error if it isn't one.`
			`repo = git.Repo(path)`

			`# Iterate through every commit for the given branch in the repository`
			`for commit in repo.iter_commits(branch):`
			`# Determine the parent of the commit to diff against.`
			`# If no parent, this is the first commit, so use empty tree.`
			`# Then create a mapping of path to diff for each file changed.`
			`parent = commit.parents[0] if commit.parents else EMPTY_TREE_SHA`
			`diffs = {`
			`diff.a_path: diff for diff in commit.diff(parent)`
			`}`

			`# The stats on the commit is a summary of all the changes for this`
			`# commit, we'll iterate through it to get the information we need.`
			`for objpath, stats in commit.stats.files.items():`

			`# Select the diff for the path in the stats`
			`diff = diffs.get(objpath)`

			`# If the path is not in the dictionary, it's because it was`
			`# renamed, so search through the b_paths for the current name.`
			`if not diff:`
			`for diff in diffs.values():`
			`if diff.b_path == path and diff.renamed:`
			`break`

			`p = Path(objpath)`
			`if len(p.parts) != 3:`
			`print(f'skipping path: wrong depth ({p.parts})', file=sys.stderr)`
			`continue`
			`try:`
			`obj = commit.tree / objpath`
			`with io.BytesIO(obj.data_stream.read()) as f:`
			`lastline = list(f.readlines())[-1].decode('utf-8')`
			`except Exception as e:`
			`print(f'failed to load file at commit {commit}', file=sys.stderr)`
			`continue`

			`lastline = lastline.strip()`
			`try:`
			`d = json.loads(lastline)`
			`except Exception as e:`
			`print(f'failed to parse json at commit {commit}: {e}', file=sys.stderr)`
			`continue`

			`row = {`
			`'path': os.path.join(path, objpath),`
			`'commit': commit.hexsha,`
			`'author': commit.author.email,`
			`'time': commit.authored_datetime.strftime(DATE_TIME_FORMAT),`
			`#'size': diff_size(diff),`
			`#'type': diff_type(diff),`
			`'crate': d['name'],`
			`'vers': d['vers'],`
			`#'json': lastline,`
			`}`

			`# Update the stats with the additional information`
			`# stats.update(row)`
			`# yield stats`

			`yield row`


			`def diff_size(diff):`
			`"""`
			`Computes the size of the diff by comparing the size of the blobs.`
			`"""`
			`if diff.b_blob is None and diff.deleted_file:`
			`# This is a deletion, so return negative the size of the original.`
			`return diff.a_blob.size * -1`

			`if diff.a_blob is None and diff.new_file:`
			`# This is a new file, so return the size of the new value.`
			`return diff.b_blob.size`

			`# Otherwise just return the size a-b`
			`return diff.a_blob.size - diff.b_blob.size`


			`def diff_type(diff):`
			`"""`
			`Determines the type of the diff by looking at the diff flags.`
			`"""`
			`if diff.renamed: return 'R'`
			`if diff.deleted_file: return 'D'`
			`if diff.new_file: return 'A'`
			`return 'M'`

			`df = pd.DataFrame(versions('crate-index'))`
			`df['time'] = pd.to_datetime(df['time'])`
			`df['unix_nanos'] = df['time'].astype('int')`
			`df = df.sort_values(by='time').groupby(['crate', 'vers']).last().reset_index()`
			`buf = io.StringIO()`
			`df.to_csv(buf, index=False)`
			`print(buf.getvalue())`