"""Extract which crate versions were published in which order from an index repo.

Provenance: added as ``script/get-publish-history.py`` by commit
fa147a8ee40b594b32d7992366699e1f181a74e0 (Jonathan Strong,
Fri, 3 Nov 2023 16:27:21 -0400), subject: "easier solution (?): python script
to extract which crates were published in which order from the index repo".

The script walks every commit of a branch of a git checkout, looks at each
file the commit touched, parses the last line of that file as JSON and records
the ``name``/``vers`` found there together with the commit metadata. The rows
are then reduced to one per ``(crate, vers)`` pair and printed to stdout as
CSV.

Usage::

    python get-publish-history.py [REPO_PATH [BRANCH]]

``REPO_PATH`` defaults to ``crate-index`` and ``BRANCH`` to ``master``.
"""

## Imports
import io
import json
import os
import sys
from pathlib import Path

import pandas as pd

## Module Constants
DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S%z"
EMPTY_TREE_SHA = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

# Keys of the rows yielded by ``versions``, in the order they are emitted.
ROW_COLUMNS = ['path', 'commit', 'author', 'time', 'crate', 'vers']

# Columns of the frame returned by ``build_history_frame``.
OUTPUT_COLUMNS = ['crate', 'vers', 'path', 'commit', 'author', 'time', 'unix_nanos']


def is_index_entry_path(parts):
    """
    Return True if the path components could belong to a crate index file.

    Three-component paths are accepted unconditionally (this is the check the
    script has always applied). Two-component paths are additionally accepted
    when they match the layout Cargo uses for crates whose names are one or
    two characters long: ``1/<name>`` and ``2/<name>``. Everything else
    (for example a top-level ``config.json``) is rejected.
    """
    if len(parts) == 3:
        return True
    if len(parts) == 2 and parts[0] in ('1', '2'):
        # The directory name doubles as the expected crate-name length.
        return len(parts[1]) == int(parts[0])
    return False


def parse_index_entry(data):
    """
    Parse the last non-blank line of an index file as a JSON object.

    ``data`` is the raw file content as bytes. Returns the decoded dict, which
    is guaranteed to contain the keys ``name`` and ``vers``.

    Raises ``ValueError`` (of which ``UnicodeDecodeError`` and
    ``json.JSONDecodeError`` are subclasses) if the content is empty, is not
    valid UTF-8, is not valid JSON, or is not an object carrying both keys.
    """
    # Strip trailing whitespace first so a file ending in blank lines still
    # yields its last real line rather than an empty string.
    lastline = data.rstrip().rsplit(b'\n', 1)[-1].decode('utf-8').strip()
    entry = json.loads(lastline)
    if not isinstance(entry, dict) or 'name' not in entry or 'vers' not in entry:
        raise ValueError(f'entry has no name/vers: {lastline!r}')
    return entry


def versions(path, branch='master'):
    """
    This function returns a generator which iterates through all commits of
    the repository located in the given path for the given branch. For every
    file touched by a commit it yields a dict with the keys listed in
    ``ROW_COLUMNS``: the file path, the commit sha, author email and authored
    time, and the crate name/version found on the last line of the file as of
    that commit.

    Files that do not look like index entries, cannot be read at that commit
    (for example because the commit deleted them) or do not end in a valid
    JSON entry are reported on stderr and skipped.

    Raises whatever ``git.Repo`` raises if ``path`` is not a repository.
    """
    # Imported lazily so the pure helpers in this module remain importable
    # on machines that do not have GitPython installed.
    import git

    # Create the repository, raises an error if it isn't one.
    repo = git.Repo(path)

    # Iterate through every commit for the given branch in the repository
    for commit in repo.iter_commits(branch):
        # The commit metadata is the same for every file in the commit.
        sha = commit.hexsha
        author = commit.author.email
        authored = commit.authored_datetime.strftime(DATE_TIME_FORMAT)

        # The stats on the commit is a summary of all the changes for this
        # commit; only the touched paths are needed here. (No per-file diff
        # is computed: the emitted rows carry neither size nor type.)
        for objpath in commit.stats.files:
            parts = Path(objpath).parts
            if not is_index_entry_path(parts):
                print(f'skipping path: wrong depth ({parts})', file=sys.stderr)
                continue

            # Best effort: any failure to read the blob skips just this file.
            try:
                data = (commit.tree / objpath).data_stream.read()
            except Exception as e:
                print(f'failed to load file at commit {commit}: {e}', file=sys.stderr)
                continue

            try:
                entry = parse_index_entry(data)
            except ValueError as e:
                print(f'failed to parse json at commit {commit}: {e}', file=sys.stderr)
                continue

            yield {
                'path': os.path.join(path, objpath),
                'commit': sha,
                'author': author,
                'time': authored,
                'crate': entry['name'],
                'vers': entry['vers'],
            }


def diff_size(diff):
    """
    Computes the size of the diff by comparing the size of the blobs.

    ``diff`` is expected to expose ``a_blob``, ``b_blob``, ``deleted_file``
    and ``new_file`` (as GitPython diff objects do).

    NOTE(review): the sign of the result depends on which side of the diff is
    "old" and which is "new", i.e. on how the diff was produced -- confirm
    against the caller before relying on it.
    """
    if diff.b_blob is None and diff.deleted_file:
        # This is a deletion, so return negative the size of the original.
        return diff.a_blob.size * -1

    if diff.a_blob is None and diff.new_file:
        # This is a new file, so return the size of the new value.
        return diff.b_blob.size

    # Otherwise just return the size a-b
    return diff.a_blob.size - diff.b_blob.size


def diff_type(diff):
    """
    Determines the type of the diff by looking at the diff flags.

    Returns ``'R'`` for a rename, ``'D'`` for a deletion, ``'A'`` for an
    addition and ``'M'`` otherwise; the flags are checked in that order.
    """
    if diff.renamed:
        return 'R'
    if diff.deleted_file:
        return 'D'
    if diff.new_file:
        return 'A'
    return 'M'


def build_history_frame(rows):
    """
    Reduce rows produced by ``versions`` to one row per (crate, vers).

    ``rows`` is any iterable of dicts with the keys in ``ROW_COLUMNS``. The
    returned DataFrame has the columns in ``OUTPUT_COLUMNS``:

    * ``time`` is parsed with ``DATE_TIME_FORMAT`` and converted to UTC, so
      commits authored under different UTC offsets compare correctly;
    * ``unix_nanos`` is the same instant as integer nanoseconds since the
      Unix epoch;
    * for each (crate, vers) the row with the latest ``time`` is kept.

    An empty input yields an empty frame with the same columns.
    """
    frame = pd.DataFrame(list(rows), columns=ROW_COLUMNS)
    if frame.empty:
        return pd.DataFrame(columns=OUTPUT_COLUMNS)

    frame['time'] = pd.to_datetime(frame['time'], format=DATE_TIME_FORMAT, utc=True)

    # Subtracting the epoch and floor-dividing by one nanosecond gives an
    # int64 regardless of platform or of the column's storage resolution.
    epoch = pd.Timestamp('1970-01-01', tz='UTC')
    frame['unix_nanos'] = (frame['time'] - epoch) // pd.Timedelta(1, unit='ns')

    # A stable sort keeps the result deterministic when timestamps tie.
    # NOTE(review): ``last`` keeps the most recent commit that left this
    # version on the file's last line, which may be a later edit rather than
    # the original publish -- confirm this is the intended choice.
    ordered = frame.sort_values(by='time', kind='mergesort')
    return ordered.groupby(['crate', 'vers']).last().reset_index()


def main(argv=None):
    """
    Print the publish history of an index repository to stdout as CSV.

    ``argv`` defaults to ``sys.argv[1:]``; the first item (if any) is the
    repository path and the second (if any) the branch name.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    repo_path = args[0] if args else 'crate-index'
    branch = args[1] if len(args) > 1 else 'master'

    df = build_history_frame(versions(repo_path, branch))
    buf = io.StringIO()
    df.to_csv(buf, index=False)
    print(buf.getvalue())
    return 0


if __name__ == '__main__':
    sys.exit(main())