## Imports
import os
import sys
import git
import io
from pathlib import Path
import json
import pandas as pd

## Module Constants
# strftime pattern for timestamps: ISO 8601 date and time followed by the
# UTC offset, e.g. 2020-01-31T12:34:56+0000.
DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S%z"

# SHA-1 of git's empty tree object, usable as a diff base where no parent
# commit exists.
EMPTY_TREE_SHA = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"


def versions(path, branch='master'):
    """
    Yield one record per file touched by each commit on a branch.

    Walks every commit that ``repo.iter_commits(branch)`` returns for the
    repository at ``path``. For each file listed in the commit's stats whose
    path is exactly three components deep, the last line of the file as of
    that commit is parsed as JSON and turned into a row.

    Processing is best-effort: a file that is at the wrong depth, cannot be
    read from the commit's tree, is empty, or whose last line is not a JSON
    object carrying ``name`` and ``vers`` is reported on stderr and skipped.

    Args:
        path: Filesystem path of the git repository.
        branch: Name of the branch (or any rev) whose history is walked.

    Yields:
        dict with the keys ``path`` (``path`` joined with the file's path in
        the repository), ``commit`` (hex SHA), ``author`` (author email),
        ``time`` (authored datetime formatted with ``DATE_TIME_FORMAT``),
        ``crate_name`` and ``version`` (the ``name`` and ``vers`` fields of
        the parsed JSON line).

    Raises:
        Whatever ``git.Repo`` raises when ``path`` is not a repository.
    """

    # Create the repository, raises an error if it isn't one.
    repo = git.Repo(path)

    for commit in repo.iter_commits(branch):
        # Only the file names in the stats are needed. No per-file diff
        # against the parent is computed: nothing in the yielded row is
        # derived from it.
        for objpath in commit.stats.files:
            p = Path(objpath)
            if len(p.parts) != 3:
                print(f'skipping path: wrong depth ({p.parts})', file=sys.stderr)
                continue

            # Read the file content as it exists in this commit's tree.
            # The catch is deliberately broad: lookup, read and decode can
            # fail in different ways and every failure means "skip the file".
            try:
                obj = commit.tree / objpath
                with io.BytesIO(obj.data_stream.read()) as f:
                    lines = f.readlines()
            except Exception as e:
                print(f'failed to load file at commit {commit}: {e}', file=sys.stderr)
                continue

            if not lines:
                print(
                    f'failed to load file at commit {commit}: {objpath} is empty',
                    file=sys.stderr,
                )
                continue

            try:
                lastline = lines[-1].decode('utf-8').strip()
            except UnicodeDecodeError as e:
                print(f'failed to load file at commit {commit}: {e}', file=sys.stderr)
                continue

            try:
                record = json.loads(lastline)
            except Exception as e:
                print(f'failed to parse json at commit {commit}: {e}', file=sys.stderr)
                continue

            # A line that parses but is not an object with both fields must
            # not abort the whole walk; skip it like any other bad file.
            try:
                crate_name = record['name']
                version = record['vers']
            except (KeyError, TypeError) as e:
                print(
                    f'missing name/vers in json at commit {commit}: {e!r}',
                    file=sys.stderr,
                )
                continue

            yield {
                'path': os.path.join(path, objpath),
                'commit': commit.hexsha,
                'author': commit.author.email,
                'time': commit.authored_datetime.strftime(DATE_TIME_FORMAT),
                'crate_name': crate_name,
                'version': version,
            }


#def diff_size(diff):
#    """
#    Computes the size of the diff by comparing the size of the blobs.
#    """
#    if diff.b_blob is None and diff.deleted_file:
#        # This is a deletion, so return negative the size of the original.
#        return diff.a_blob.size * -1
#
#    if diff.a_blob is None and diff.new_file:
#        # This is a new file, so return the size of the new value.
#        return diff.b_blob.size
#
#    # Otherwise just return the size a-b
#    return diff.a_blob.size - diff.b_blob.size
#
#
#def diff_type(diff):
#    """
#    Determines the type of the diff by looking at the diff flags.
#    """
#    if diff.renamed: return 'R'
#    if diff.deleted_file: return 'D'
#    if diff.new_file: return 'A'
#    return 'M'

def main(path):
    """
    Print a CSV with one row per (crate_name, version) pair to stdout.

    Collects every record produced by ``versions(path)``, parses the ``time``
    column as UTC datetimes, and for each (crate_name, version) group keeps
    the values that come last when ordered by time. If no records were
    produced, a note is written to stderr and nothing is printed to stdout.

    Args:
        path: Filesystem path of the git repository to scan.
    """
    rows = list(versions(path))
    if not rows:
        # An empty frame has no 'time' column, so the steps below would
        # fail with a KeyError instead of reporting the real situation.
        print(f'no version records found in {path}', file=sys.stderr)
        return

    df = pd.DataFrame(rows)
    df['time'] = pd.to_datetime(df['time'], utc=True)
    latest = (
        df.sort_values(by='time')
        .groupby(['crate_name', 'version'])
        .last()
        .reset_index()
    )
    # to_csv() without a target returns the CSV text; print() appends one
    # further newline after it.
    print(latest.to_csv(index=False))

if __name__ == '__main__':
    # The repository path is the single required command-line argument;
    # fail with a usage message rather than an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <path-to-git-repository>')
    main(sys.argv[1])