Jonathan Strong
1 year ago
1 changed files with 118 additions and 0 deletions
@ -0,0 +1,118 @@
|
||||
## Imports |
||||
import os |
||||
import sys |
||||
import git |
||||
import io |
||||
from pathlib import Path |
||||
import json |
||||
import pandas as pd |
||||
|
||||
## Module Constants |
||||
DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S%z" |
||||
EMPTY_TREE_SHA = "4b825dc642cb6eb9a060e54bf8d69288fbee4904" |
||||
|
||||
|
||||
def versions(path, branch='master'):
    """
    Walk every commit of ``branch`` in the git repository at ``path`` and
    yield one row per changed file whose last line parses as JSON.

    Args:
        path: Location of the repository, handed to ``git.Repo``. It is also
            used as the prefix of the ``'path'`` value in each yielded row.
        branch: Revision passed to ``repo.iter_commits``. Defaults to
            ``'master'``.

    Yields:
        dict with the keys ``'path'``, ``'commit'`` (hex sha), ``'author'``
        (author e-mail), ``'time'`` (authored datetime formatted with
        ``DATE_TIME_FORMAT``), ``'crate'`` and ``'vers'`` (the ``name`` and
        ``vers`` entries of the JSON decoded from the file's last line).

    Raises:
        Whatever ``git.Repo`` raises when ``path`` is not a repository.
        A decoded JSON value without ``name``/``vers`` entries is not
        handled here and propagates to the caller.

    Changed files are skipped, with a message on stderr, when their path
    does not have exactly three components, when the blob cannot be read
    from the commit tree, or when the last line is not valid JSON.
    """
    # Create the repository, raises an error if it isn't one.
    repo = git.Repo(path)

    # Iterate through every commit for the given branch in the repository
    for commit in repo.iter_commits(branch):
        # Determine the parent of the commit to diff against. If there is no
        # parent, this is the first commit, so diff against the empty tree.
        # Then create a mapping of path to diff for each file changed.
        parent = commit.parents[0] if commit.parents else EMPTY_TREE_SHA
        diffs = {diff.a_path: diff for diff in commit.diff(parent)}

        # The stats on the commit summarise all the changes for this commit;
        # only the changed paths (the keys) are needed here.
        for objpath in commit.stats.files:
            # NOTE(review): only paths exactly three components deep are
            # processed -- confirm this matches the intended index layout.
            p = Path(objpath)
            if len(p.parts) != 3:
                print(f'skipping path: wrong depth ({p.parts})', file=sys.stderr)
                continue

            # Select the diff for the path in the stats. If the path is not
            # a key of the mapping, look for a rename whose b_path is this
            # path. The comparison must use objpath (the file), not `path`
            # (the repository location); `diff` stays None when nothing
            # matches instead of silently ending up as an unrelated diff.
            # `diff` is only consumed by the disabled size/type columns below.
            diff = diffs.get(objpath)
            if diff is None:
                diff = next(
                    (
                        candidate
                        for candidate in diffs.values()
                        if candidate.renamed and candidate.b_path == objpath
                    ),
                    None,
                )

            # Read the blob as it exists in this commit and keep its last
            # line. Any failure (missing tree entry, empty file, bad UTF-8)
            # is treated as "skip this file".
            try:
                obj = commit.tree / objpath
                with io.BytesIO(obj.data_stream.read()) as f:
                    lastline = f.readlines()[-1].decode('utf-8')
            except Exception:
                print(f'failed to load file at commit {commit}', file=sys.stderr)
                continue

            lastline = lastline.strip()
            try:
                entry = json.loads(lastline)
            except Exception as e:
                print(f'failed to parse json at commit {commit}: {e}', file=sys.stderr)
                continue

            row = {
                'path': os.path.join(path, objpath),
                'commit': commit.hexsha,
                'author': commit.author.email,
                'time': commit.authored_datetime.strftime(DATE_TIME_FORMAT),
                #'size': diff_size(diff),
                #'type': diff_type(diff),
                'crate': entry['name'],
                'vers': entry['vers'],
                #'json': lastline,
            }

            yield row
||||
|
||||
|
||||
def diff_size(diff):
    """
    Return a signed byte count for ``diff`` derived from its blob sizes.

    * ``b_blob`` missing and ``deleted_file`` set: minus the size of
      ``a_blob``.
    * ``a_blob`` missing and ``new_file`` set: the size of ``b_blob``.
    * anything else: size of ``a_blob`` minus size of ``b_blob``.
    """
    blob_a = diff.a_blob
    blob_b = diff.b_blob

    # Deletion: only the a-side blob exists, report its size negated.
    if blob_b is None and diff.deleted_file:
        return -blob_a.size

    # New file: only the b-side blob exists, report its size as-is.
    if blob_a is None and diff.new_file:
        return blob_b.size

    # Both sides exist: the difference a - b.
    return blob_a.size - blob_b.size
||||
|
||||
|
||||
def diff_type(diff):
    """
    Map the flags of ``diff`` to a one-letter change code.

    The flags are checked in a fixed order and the first one that is set
    wins: ``renamed`` -> 'R', ``deleted_file`` -> 'D', ``new_file`` -> 'A'.
    When none of them is set the result is 'M'.
    """
    flag_codes = (
        ('renamed', 'R'),
        ('deleted_file', 'D'),
        ('new_file', 'A'),
    )
    for flag_name, code in flag_codes:
        if getattr(diff, flag_name):
            return code
    return 'M'
||||
|
||||
# Collect every row yielded for the 'crate-index' repository into a table.
df = pd.DataFrame(versions('crate-index'))

# Parse the formatted timestamps and add their integer representation
# (presumably nanoseconds since the epoch, going by the column name).
df['time'] = pd.to_datetime(df['time'])
df['unix_nanos'] = df['time'].astype('int')

# Order by time, then keep the `last()` values of each (crate, vers) group.
df = (
    df.sort_values(by='time')
    .groupby(['crate', 'vers'])
    .last()
    .reset_index()
)

# Render the table as CSV in memory and print it to stdout.
buf = io.StringIO()
df.to_csv(buf, index=False)
print(buf.getvalue())
Loading…
Reference in new issue