From d740be6709ab7ac6a1b271430d650e4381f3f761 Mon Sep 17 00:00:00 2001
From: Brian Harring
Date: Tue, 16 Oct 2012 00:21:07 -0700
Subject: refactoring; bypass the commit creation in each repo and
 linearization by git; handle it ourselves

---
 create-git.sh          | 26 ++++++++++--------------
 process_directory.sh   | 10 ++++++----
 rewrite-commit-dump.py | 54 ++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/create-git.sh b/create-git.sh
index 6389024..667fed0 100755
--- a/create-git.sh
+++ b/create-git.sh
@@ -13,28 +13,24 @@ git config core.logAllRefUpdates false
 git config prune.expire now
 mkdir -p objects/info
 targets=( $(find ../final/ -maxdepth 1 -mindepth 1 -printf '../final/%P/\n' | \
-    xargs -n1 readlink -f | tee >(sed -e 's:$:/git/objects:' > objects/info/alternates) ) )
-for x in "${targets[@]}"; do
-    rev=$(git --git-dir $x/git rev-list -1 master 2> /dev/null)
-    [ -z "$rev" ] && { echo "no content: $x"; continue; }
-    x="refs/heads/source/$(basename $x)"
-    git update-ref "$x" $rev
-done
-
-echo "linearizing history, and rewriting messages..."
+    xargs -n1 readlink -f | \
+    while read l; do
+        [ -e "$l/cvs2svn-tmp/git-dump.dat" ] || continue;
+        echo "$l/git/objects" >> objects/info/alternates
+        echo "$l"
+    done
+    )
+)
+echo "loading all commits, linearizing, and rewriting history..."
 time (
-    git fast-export --progress=1000 --all --reverse --date-order --no-data | \
-    tee ../export-stream-raw | \
-    "${root}/rewrite-commit-dump.py" | \
+    "${root}/rewrite-commit-dump.py" "${targets[@]}" | \
     tee ../export-stream-rewritten | \
     git fast-import
 ) 2>&1 | tee git-creation.log
 echo "recomposed; repacking and breaking alternate linkage..."
-# Wipe the strong refs to the other repos...
-git ls-remote . refs/heads/source/'*' | awk '{print $2;}' | xargs -n1 git update-ref -d
-# Localize the content...
+# Localize the content we actually use out of the alternates...
 time git repack -Adf --window=100 --depth=100
 # Wipe the alternates.
 rm objects/info/alternates
 
 
diff --git a/process_directory.sh b/process_directory.sh
index 14ef28c..a7be6ed 100755
--- a/process_directory.sh
+++ b/process_directory.sh
@@ -16,10 +16,12 @@ f() {
     time cvs2git --options config -vv
     cd git
     git init --bare
-    { "${base}/rewrite-blob-data.py" ../cvs2svn-tmp/git-blob.dat;
-      cat ../cvs2svn-tmp/git-dump.dat;
-    } | git fast-import
-    rm -rf "${final}" git-work
+    # Note we're only pulling in blob data here; this is intentional: we need to
+    # interlace the commit objects together; these git object pools will be
+    # used as alternates for the final repo combination.
+ "${base}/rewrite-blob-data.py" ../cvs2svn-tmp/git-blob.dat | \ + git fast-import --export-marks=../cvs2svn-tmp/git-blob.idx + rm -rf "${final}" cd "$root" mv "$output" "${final}" set +x diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py index 7678406..f657a8e 100755 --- a/rewrite-commit-dump.py +++ b/rewrite-commit-dump.py @@ -1,5 +1,7 @@ #!/usr/bin/python import functools +import operator +import os import re import sys from collections import namedtuple @@ -12,10 +14,10 @@ mangler.append(functools.partial( re.compile(r"^\(portage version: (.*)\)$", re.M|re.I).sub, r"Package-Manager: portage-\1")) -fields = ('mark', 'author', 'committer', 'msg', 'files') +fields = ('author', 'committer', 'msg', 'files', 'timestamp') record = namedtuple('record', fields) -def deserialize_records(source): +def deserialize_records(source, blob_idx): line = source.readline() while line: while line.split()[0] in ('reset', 'progress'): @@ -28,9 +30,9 @@ def deserialize_records(source): line = source.readline() chunks = line.split(None, 1) assert len(chunks) == 2, line - if chunks[0] == 'from': + if chunks[0] in ('from', 'mark'): continue - assert chunks[0] in ('mark', 'author', 'committer', 'data') + assert chunks[0] in ('author', 'committer', 'data') if chunks[0] != 'data': d[chunks[0]] = chunks[1].strip() continue @@ -63,28 +65,39 @@ def deserialize_records(source): files[mode[1]] = (mode[0], line) elif mode[0] == 'M': # M 100644 e8b9ed651c6209820779382edee2537209aba4ae dev-cpp/gtkmm/ChangeLog - chunks = mode[1].split(None, 3) - assert len(chunks) == 3, line - files[chunks[2]] = (mode[0], line) + # if it's not a sha1, but startswith ':'... then it's an index. + chunks = line.split(None, 4) + assert len(chunks) == 4, line + fname = chunks[3] + if chunks[2][0] == ':': + line = ' '.join(chunks[:2] + [blob_idx[int(chunks[2][1:])], fname]) + files[fname] = (mode[0], line) else: raise AssertionError("got unknown file op: mode=%r, line:\n%r" % (mode[0], line)) line = source.readline() d['files'] = files # Basic sanity check for the code above... assert set(fields).issuperset(d), d + d.setdefault('author', d.get('committer')) + assert d['author'] is not None + # Skank the timestamp out... + chunks = d['author'].rsplit(None, 1) + assert len(chunks) == 2 and chunks[1] == '+0000', d['author'] + d['timestamp'] = long(chunks[0].rsplit(None, 1)[1]) yield record(*[d.get(x) for x in fields]) # Bleh... of course namedtuple doesn't make this easy. 
     line = source.readline()
 
-def serialize_records(records, handle, target='refs/heads/master', progress=1000):
+def serialize_records(records, handle, target='refs/heads/master', progress=5000):
   write = handle.write
   write('reset %s\n' % target)
   total = len(records)
   for idx, record in enumerate(records, 1):
     if idx % progress == 0:
       write('progress %02.1f%%: %i of %i commits\n'
-        % ((100 * float(idx))//total, idx, total))
+        % ((100 * float(idx))/total, idx, total))
     write('commit %s\n' % target)
+    write('mark :%i\n' % idx)
     # fields = ('mark', 'author', 'committer', 'msg', 'files')
     for name, value in zip(fields, record):
       if name == 'files':
@@ -94,17 +107,32 @@ def serialize_records(records, handle, target='refs/heads/master', progress=1000
         write("%s %s\n" % (name, value))
       elif name == 'msg':
         write("data %i\n%s" % (len(value), value))
+      elif name == 'timestamp':
+        continue
       else:
         raise AssertionError("serialize is out of sync; don't know field %s" % name)
     write("\n")
 
+def deserialize_blob_map(source):
+  source = (x.strip().split() for x in source)
+  return dict((int(x[0].lstrip(':')), x[1]) for x in source)
+
 def main(argv):
-  source = open(argv[0], 'r') if argv else sys.stdin
-  records = list(deserialize_records(source))
+  records = []
+  source = argv if argv else sys.stdin
+  directories = [x.strip() for x in source]
+  for directory in directories:
+    tmp = os.path.join(directory, 'cvs2svn-tmp')
+    commits = os.path.join(tmp, 'git-dump.dat')
+    if not os.path.exists(commits):
+      sys.stderr.write("skipping %s; no commit data\n" % directory)
+      continue
+    blob_index = deserialize_blob_map(open(os.path.join(tmp, 'git-blob.idx')))
+    records.extend(deserialize_records(open(commits, 'r'), blob_index))
+  records.sort(key=operator.attrgetter('timestamp'))
+  #records = list(deserialize_records(source))
   serialize_records(records, sys.stdout)
   return 0
 
 if __name__ == '__main__':
   sys.exit(main(sys.argv[1:]))
--
cgit v1.2.3-65-gdbad
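
The create-git.sh hunk above relies on git's alternates mechanism: objects/info/alternates lists one object directory per line, and git resolves objects out of those pools as if they were local, which is why fast-import can reference the per-module blob sha1s without copying any blob data. The later 'git repack -Adf' then copies only the reachable objects into the combined repo, after which the alternates file can be safely removed. A minimal Python sketch of the same wiring; the helper name and example paths are hypothetical, not from the patch:

    # Sketch only: mirrors `echo "$l/git/objects" >> objects/info/alternates`
    # from create-git.sh.  Helper name and example paths are illustrative.
    import os

    def add_alternates(combined_git_dir, donor_repo_dirs):
        info_dir = os.path.join(combined_git_dir, 'objects', 'info')
        if not os.path.isdir(info_dir):
            os.makedirs(info_dir)
        handle = open(os.path.join(info_dir, 'alternates'), 'a')
        for donor in donor_repo_dirs:
            # One object-directory path per line.
            handle.write(os.path.join(donor, 'git', 'objects') + '\n')
        handle.close()

    # e.g.: add_alternates('.', ['/tmp/final/app-editors', '/tmp/final/dev-cpp'])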
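
process_directory.sh now imports only blobs into each per-module repo and saves the mark-to-sha1 table via --export-marks; rewrite-commit-dump.py later uses that table (git-blob.idx) to rewrite filemodify lines of the form 'M 100644 :17 some/file' into raw sha1s that resolve through the alternates. Each line of an --export-marks file has the form ':<idnum> <sha1>'. A self-contained sketch of just that resolution step, assuming the same cvs2svn-tmp file layout; unlike the full parser in the patch, it deliberately ignores the framing of 'data' blocks:

    # Sketch only: resolve ':mark' blob references in a cvs2svn commit dump
    # against a `git fast-import --export-marks` file.
    import sys

    def load_marks(path):
        # Marks file lines look like ':1 e8b9ed651c6209820779382edee2537209aba4ae'.
        marks = {}
        for line in open(path):
            mark, sha1 = line.split()
            marks[int(mark.lstrip(':'))] = sha1
        return marks

    def resolve_blob_refs(dump, marks, out=sys.stdout):
        # Rewrites 'M 100644 :17 some/file' -> 'M 100644 <sha1> some/file'.
        # Naive: assumes no line inside a 'data' payload looks like a
        # filemodify command; the patch's parser tracks data blocks properly.
        for line in dump:
            chunks = line.split(None, 3)
            if len(chunks) == 4 and chunks[0] == 'M' and chunks[2].startswith(':'):
                line = ' '.join(chunks[:2] + [marks[int(chunks[2][1:])], chunks[3]])
            out.write(line)

    if __name__ == '__main__':
        resolve_blob_refs(open('cvs2svn-tmp/git-dump.dat'),
                          load_marks('cvs2svn-tmp/git-blob.idx'))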
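
For reference, what serialize_records() emits is plain 'git fast-import' input: a single 'reset', then the timestamp-sorted commits on one branch, each carrying a sequential mark. A sketch that emits a one-commit stream of that shape; the author, timestamp, and message are illustrative, and the sha1 is borrowed from the comment in the diff above:

    # Sketch only: emits a single-commit stream shaped like the rewritten
    # output of serialize_records().  All field values are illustrative.
    import sys

    msg = "dev-cpp/gtkmm: version bump\n\nPackage-Manager: portage-2.1\n"
    write = sys.stdout.write
    write('reset refs/heads/master\n')
    write('commit refs/heads/master\n')
    write('mark :1\n')
    write('author Larry The Cow <larry@gentoo.org> 1350370867 +0000\n')
    write('committer Larry The Cow <larry@gentoo.org> 1350370867 +0000\n')
    write('data %i\n%s' % (len(msg), msg))
    write('M 100644 e8b9ed651c6209820779382edee2537209aba4ae dev-cpp/gtkmm/ChangeLog\n')
    write('\n')

Piping that into 'git fast-import' inside an empty bare repo produces a one-commit master branch; the real stream is the same pattern repeated once per record, in timestamp order, which is what lets this patch skip per-repo commit creation and git's own linearization entirely.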