easier solution (?): python script to extract which crates were published in which order from the index repo

3 years ago · fa147a8ee4
1 changed files with 118 additions and 0 deletions
--- a/script/get-publish-history.py
+++ b/script/get-publish-history.py
@ -0,0 +1,118 @@
+## Imports
+import os
+import sys
+import git
+import io
+from pathlib import Path
+import json
+import pandas as pd
+
+## Module Constants
+DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S%z"
+EMPTY_TREE_SHA   = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
+
+
+def versions(path, branch='master'):
+    """
+    Yield one record per index file touched by each commit of a branch.
+
+    Walks every commit that ``repo.iter_commits(branch)`` returns for the
+    repository located at ``path``. For each file listed in a commit's
+    stats whose path has exactly three components, the last non-blank line
+    of that file as of the commit is parsed as JSON, and a dict is yielded
+    with the keys ``path``, ``commit``, ``author``, ``time``, ``crate``
+    and ``vers``.
+
+    Files that cannot be read, decoded or parsed (or whose JSON lacks the
+    ``name``/``vers`` keys) are reported on stderr and skipped, so a single
+    bad entry does not abort the whole walk.
+
+    ``git.Repo`` raises if ``path`` is not a repository.
+    """
+
+    # Create the repository, raises an error if it isn't one.
+    repo = git.Repo(path)
+
+    # Iterate through every commit for the given branch in the repository
+    for commit in repo.iter_commits(branch):
+        # Determine the parent of the commit to diff against.
+        # If no parent, this is the first commit, so use empty tree.
+        # Then create a mapping of path to diff for each file changed.
+        parent = commit.parents[0] if commit.parents else EMPTY_TREE_SHA
+        diffs = {
+            diff.a_path: diff for diff in commit.diff(parent)
+        }
+
+        # The stats on the commit is a summary of all the changes for this
+        # commit; only the file paths are needed here.
+        for objpath in commit.stats.files:
+
+            # Skip early so no lookup work is spent on ignored paths.
+            p = Path(objpath)
+            if len(p.parts) != 3:
+                print(f'skipping path: wrong depth ({p.parts})', file=sys.stderr)
+                continue
+
+            # Select the diff for the path in the stats. If the path is not
+            # in the dictionary, look for a rename whose b_path is this
+            # path. The comparison must use objpath (the file), not path
+            # (the repository directory), and must fall back to None rather
+            # than to whichever diff happened to be iterated last.
+            # NOTE: diff is only needed by the optional 'size'/'type'
+            # columns that are currently disabled below.
+            diff = diffs.get(objpath)
+            if diff is None:
+                diff = next(
+                    (
+                        candidate for candidate in diffs.values()
+                        if candidate.renamed and candidate.b_path == objpath
+                    ),
+                    None,
+                )
+
+            # Read the file as of this commit and keep its last line.
+            # Trailing whitespace is stripped first so that a file ending
+            # in blank lines still yields its last real record.
+            try:
+                obj = commit.tree / objpath
+                data = obj.data_stream.read()
+                lastline = data.rstrip().rsplit(b'\n', 1)[-1].decode('utf-8')
+            except Exception as e:
+                # Deliberately broad: any failure to load one file (for
+                # example a path that no longer exists in this commit's
+                # tree) should only skip that file.
+                print(f'failed to load file at commit {commit}: {e}', file=sys.stderr)
+                continue
+
+            lastline = lastline.strip()
+            try:
+                d = json.loads(lastline)
+                crate = d['name']
+                vers = d['vers']
+            except (ValueError, KeyError, TypeError) as e:
+                # ValueError covers malformed JSON; KeyError/TypeError cover
+                # JSON that parses but is not an object with name/vers.
+                print(f'failed to parse json at commit {commit}: {e}', file=sys.stderr)
+                continue
+
+            row = {
+                'path': os.path.join(path, objpath),
+                'commit': commit.hexsha,
+                'author': commit.author.email,
+                'time': commit.authored_datetime.strftime(DATE_TIME_FORMAT),
+                #'size': diff_size(diff),
+                #'type': diff_type(diff),
+                'crate': crate,
+                'vers': vers,
+                #'json': lastline,
+            }
+
+            yield row
+
+
+def diff_size(diff):
+    """
+    Return a signed size for the diff, derived from the sizes of its blobs.
+
+    * b-side blob missing on a deleted file: minus the a-side blob size.
+    * a-side blob missing on a new file: the b-side blob size.
+    * anything else: a-side size minus b-side size.
+    """
+    side_a, side_b = diff.a_blob, diff.b_blob
+
+    # Deletion: only the a-side blob exists, report it as negative.
+    if side_b is None and diff.deleted_file:
+        return -side_a.size
+
+    # Addition: only the b-side blob exists, report its size as is.
+    if side_a is None and diff.new_file:
+        return side_b.size
+
+    # Both sides present: difference of the two sizes (a minus b).
+    return side_a.size - side_b.size
+
+
+def diff_type(diff):
+    """
+    Map the diff's flags to a one-letter change code.
+
+    Flags are checked in priority order: renamed ('R'), deleted_file ('D'),
+    new_file ('A'). If none of them is set the diff is reported as 'M'.
+    """
+    flag_codes = (
+        ('renamed', 'R'),
+        ('deleted_file', 'D'),
+        ('new_file', 'A'),
+    )
+    for flag, code in flag_codes:
+        if getattr(diff, flag):
+            return code
+    return 'M'
+
+# Build the publish history from the 'crate-index' checkout and print it
+# as CSV: one row per (crate, vers), keeping the row with the latest
+# authored time among the commits that touched that version.
+#
+# The column list is given explicitly so that an empty history still
+# produces a frame with a 'time' column instead of raising KeyError below.
+df = pd.DataFrame(
+    list(versions('crate-index')),
+    columns=['path', 'commit', 'author', 'time', 'crate', 'vers'],
+)
+# utc=True normalises every timestamp to UTC. Without it, commits authored
+# with different UTC offsets do not parse into a single datetime dtype,
+# which breaks the integer conversion and the sort that follow.
+df['time'] = pd.to_datetime(df['time'], format=DATE_TIME_FORMAT, utc=True)
+# 'int64' is explicit; plain 'int' is platform-dependent and can be too
+# narrow to hold a nanosecond timestamp.
+df['unix_nanos'] = df['time'].astype('int64')
+df = df.sort_values(by='time').groupby(['crate', 'vers']).last().reset_index()
+buf = io.StringIO()
+df.to_csv(buf, index=False)
+print(buf.getvalue())