## Imports
import os
import sys
import git
import io
from pathlib import Path
import json
import pandas as pd

## Module Constants
DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S%z"
EMPTY_TREE_SHA = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"


def versions(path, branch='master'):
    """
    Iterate over every commit of ``branch`` in the git repository at ``path``
    and yield one record per changed file whose last line is a JSON object.

    For each file listed in a commit's stats, the file's content at that
    commit is read, its last non-blank line is parsed as JSON, and the
    ``name`` and ``vers`` fields are extracted. Files that cannot be
    processed are reported on stderr and skipped, so a single bad entry
    never stops the iteration:

    * paths that do not have exactly three components,
    * files that cannot be loaded from the commit's tree,
    * last lines that are not valid JSON,
    * JSON values without ``name`` / ``vers`` fields.

    Args:
        path: Filesystem path of the repository. ``git.Repo`` raises if it
            is not a repository.
        branch: Revision whose history is walked (default ``'master'``).

    Yields:
        dict with the keys ``path`` (``path`` joined with the file's path in
        the repository), ``commit`` (hex SHA), ``author`` (author e-mail),
        ``time`` (authored datetime formatted with ``DATE_TIME_FORMAT``),
        ``crate_name`` and ``version``.
    """
    # Create the repository, raises an error if it isn't one.
    repo = git.Repo(path)

    # Iterate through every commit for the given branch in the repository
    for commit in repo.iter_commits(branch):
        # The stats on the commit list every file it touched. Only the file
        # paths are needed here.
        #
        # NOTE: no per-commit diff is computed. It only fed the disabled
        # 'size'/'type' columns below. Re-enabling them requires looking up
        # the diff for ``objpath`` (matching ``b_path`` for renames).
        for objpath in commit.stats.files:
            p = Path(objpath)
            if len(p.parts) != 3:
                print(f'skipping path: wrong depth ({p.parts})', file=sys.stderr)
                continue

            # Deleted or renamed entries are not present in the commit's
            # tree under this name; they land in the except branch.
            try:
                obj = commit.tree / objpath
                data = obj.data_stream.read()
                # Last non-blank line, without materialising every line.
                lastline = data.rstrip().rsplit(b'\n', 1)[-1].decode('utf-8')
            except Exception:
                print(f'failed to load file at commit {commit}', file=sys.stderr)
                continue

            lastline = lastline.strip()

            try:
                d = json.loads(lastline)
            except ValueError as e:
                print(f'failed to parse json at commit {commit}: {e}', file=sys.stderr)
                continue

            # A record without the expected fields (or a JSON value that is
            # not an object) is skipped like any other unreadable entry
            # instead of aborting the whole generator.
            try:
                crate_name = d['name']
                version = d['vers']
            except (KeyError, TypeError) as e:
                print(f'missing name/vers at commit {commit}: {e!r}', file=sys.stderr)
                continue

            row = {
                'path': os.path.join(path, objpath),
                'commit': commit.hexsha,
                'author': commit.author.email,
                'time': commit.authored_datetime.strftime(DATE_TIME_FORMAT),
                #'size': diff_size(diff),
                #'type': diff_type(diff),
                'crate_name': crate_name,
                'version': version,
                #'json': lastline,
            }

            yield row


#def diff_size(diff):
#    """
#    Computes the size of the diff by comparing the size of the blobs.
#    """
#    if diff.b_blob is None and diff.deleted_file:
#        # This is a deletion, so return negative the size of the original.
#        return diff.a_blob.size * -1
#
#    if diff.a_blob is None and diff.new_file:
#        # This is a new file, so return the size of the new value.
#        return diff.b_blob.size
#
#    # Otherwise just return the size a-b
#    return diff.a_blob.size - diff.b_blob.size
#
#
#def diff_type(diff):
#    """
#    Determines the type of the diff by looking at the diff flags.
#    """
#    if diff.renamed: return 'R'
#    if diff.deleted_file: return 'D'
#    if diff.new_file: return 'A'
#    return 'M'


def main(path):
    """
    Print, as CSV on stdout, the most recently authored record for every
    (crate_name, version) pair found in the repository at ``path``.

    Records come from ``versions(path)``. They are sorted by authored time
    (converted to UTC) and, for each (crate_name, version) group, only the
    last one is kept.

    Args:
        path: Filesystem path of the git repository to scan.
    """
    # Naming the columns up front keeps the pipeline working when no record
    # is produced: an empty frame still has a 'time' column, so the output is
    # a header-only CSV rather than a KeyError.
    columns = ['path', 'commit', 'author', 'time', 'crate_name', 'version']
    df = pd.DataFrame(list(versions(path)), columns=columns)

    df['time'] = pd.to_datetime(df['time'], utc=True)
    # df['unix_nanos'] = df['time'].astype('int')

    df = (
        df.sort_values(by='time')
        .groupby(['crate_name', 'version'])
        .last()
        .reset_index()
    )

    buf = io.StringIO()
    df.to_csv(buf, index=False)
    print(buf.getvalue())


if __name__ == '__main__':
    # Fail with a readable message instead of a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <path-to-repository>')
    path = sys.argv[1]
    main(path)