From d740be6709ab7ac6a1b271430d650e4381f3f761 Mon Sep 17 00:00:00 2001
From: Brian Harring
Date: Tue, 16 Oct 2012 00:21:07 -0700
Subject: refactoring; bypass the commit creation in each repo and
 linearization by git; handle it ourselves

---
 create-git.sh          | 26 ++++++++++--------------
 process_directory.sh   | 10 ++++++----
 rewrite-commit-dump.py | 54 ++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/create-git.sh b/create-git.sh
index 6389024..667fed0 100755
--- a/create-git.sh
+++ b/create-git.sh
@@ -13,28 +13,24 @@ git config core.logAllRefUpdates false
 git config prune.expire now
 mkdir -p objects/info
 targets=( $(find ../final/ -maxdepth 1 -mindepth 1 -printf '../final/%P/\n' | \
-    xargs -n1 readlink -f | tee >(sed -e 's:$:/git/objects:' > objects/info/alternates) ) )
-for x in "${targets[@]}"; do
-    rev=$(git --git-dir $x/git rev-list -1 master 2> /dev/null)
-    [ -z "$rev" ] && { echo "no content: $x"; continue; }
-    x="refs/heads/source/$(basename $x)"
-    git update-ref "$x" $rev
-done
-
-echo "linearizing history, and rewriting messages..."
+    xargs -n1 readlink -f | \
+    while read l; do
+        [ -e "$l/cvs2svn-tmp/git-dump.dat" ] || continue;
+        echo "$l/git/objects" >> objects/info/alternates
+        echo "$l"
+    done
+    )
+)
+echo "loading all commits, linearizing, and rewriting history..."
 time (
-    git fast-export --progress=1000 --all --reverse --date-order --no-data | \
-    tee ../export-stream-raw | \
-    "${root}/rewrite-commit-dump.py" | \
+    "${root}/rewrite-commit-dump.py" "${targets[@]}" | \
     tee ../export-stream-rewritten | \
     git fast-import
 ) 2>&1 | tee git-creation.log
 echo "recomposed; repacking and breaking alternate linkage..."
-# Wipe the strong refs to the other repos...
-git ls-remote . refs/heads/source/'*' | awk '{print $2;}' | xargs -n1 git update-ref -d
-# Localize the content...
+# Localize the content we actually use out of the alternates...
 time git repack -Adf --window=100 --depth=100
 # Wipe the alternates.
 rm objects/info/alternates
 
 
diff --git a/process_directory.sh b/process_directory.sh
index 14ef28c..a7be6ed 100755
--- a/process_directory.sh
+++ b/process_directory.sh
@@ -16,10 +16,12 @@ f() {
     time cvs2git --options config -vv
     cd git
     git init --bare
-    { "${base}/rewrite-blob-data.py" ../cvs2svn-tmp/git-blob.dat;
-      cat ../cvs2svn-tmp/git-dump.dat;
-    } | git fast-import
-    rm -rf "${final}" git-work
+    # Note we're only pulling in blob data here; this is intentional: we need to
+    # interlace the commit objects together; these git object pools will be
+    # used as alternates for the final repo combination.
+ "${base}/rewrite-blob-data.py" ../cvs2svn-tmp/git-blob.dat | \ + git fast-import --export-marks=../cvs2svn-tmp/git-blob.idx + rm -rf "${final}" cd "$root" mv "$output" "${final}" set +x diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py index 7678406..f657a8e 100755 --- a/rewrite-commit-dump.py +++ b/rewrite-commit-dump.py @@ -1,5 +1,7 @@ #!/usr/bin/python import functools +import operator +import os import re import sys from collections import namedtuple @@ -12,10 +14,10 @@ mangler.append(functools.partial( re.compile(r"^\(portage version: (.*)\)$", re.M|re.I).sub, r"Package-Manager: portage-\1")) -fields = ('mark', 'author', 'committer', 'msg', 'files') +fields = ('author', 'committer', 'msg', 'files', 'timestamp') record = namedtuple('record', fields) -def deserialize_records(source): +def deserialize_records(source, blob_idx): line = source.readline() while line: while line.split()[0] in ('reset', 'progress'): @@ -28,9 +30,9 @@ def deserialize_records(source): line = source.readline() chunks = line.split(None, 1) assert len(chunks) == 2, line - if chunks[0] == 'from': + if chunks[0] in ('from', 'mark'): continue - assert chunks[0] in ('mark', 'author', 'committer', 'data') + assert chunks[0] in ('author', 'committer', 'data') if chunks[0] != 'data': d[chunks[0]] = chunks[1].strip() continue @@ -63,28 +65,39 @@ def deserialize_records(source): files[mode[1]] = (mode[0], line) elif mode[0] == 'M': # M 100644 e8b9ed651c6209820779382edee2537209aba4ae dev-cpp/gtkmm/ChangeLog - chunks = mode[1].split(None, 3) - assert len(chunks) == 3, line - files[chunks[2]] = (mode[0], line) + # if it's not a sha1, but startswith ':'... then it's an index. + chunks = line.split(None, 4) + assert len(chunks) == 4, line + fname = chunks[3] + if chunks[2][0] == ':': + line = ' '.join(chunks[:2] + [blob_idx[int(chunks[2][1:])], fname]) + files[fname] = (mode[0], line) else: raise AssertionError("got unknown file op: mode=%r, line:\n%r" % (mode[0], line)) line = source.readline() d['files'] = files # Basic sanity check for the code above... assert set(fields).issuperset(d), d + d.setdefault('author', d.get('committer')) + assert d['author'] is not None + # Skank the timestamp out... + chunks = d['author'].rsplit(None, 1) + assert len(chunks) == 2 and chunks[1] == '+0000', d['author'] + d['timestamp'] = long(chunks[0].rsplit(None, 1)[1]) yield record(*[d.get(x) for x in fields]) # Bleh... of course namedtuple doesn't make this easy. 
     line = source.readline()
 
-def serialize_records(records, handle, target='refs/heads/master', progress=1000):
+def serialize_records(records, handle, target='refs/heads/master', progress=5000):
   write = handle.write
   write('reset %s\n' % target)
   total = len(records)
   for idx, record in enumerate(records, 1):
     if idx % progress == 0:
       write('progress %02.1f%%: %i of %i commits\n'
-        % ((100 * float(idx))//total, idx, total))
+        % ((100 * float(idx))/total, idx, total))
     write('commit %s\n' % target)
+    write('mark :%i\n' % idx)
     # fields = ('mark', 'author', 'committer', 'msg', 'files')
     for name, value in zip(fields, record):
       if name == 'files':
@@ -94,17 +107,32 @@ def serialize_records(records, handle, target='refs/heads/master', progress=1000
         write("%s %s\n" % (name, value))
       elif name == 'msg':
         write("data %i\n%s" % (len(value), value))
+      elif name == 'timestamp':
+        continue
       else:
         raise AssertionError("serialize is out of sync; don't know field %s" % name)
     write("\n")
 
+def deserialize_blob_map(source):
+  source = (x.strip().split() for x in source)
+  return dict((int(x[0].lstrip(':')), x[1]) for x in source)
+
 def main(argv):
-  source = open(argv[0], 'r') if argv else sys.stdin
-  records = list(deserialize_records(source))
+  records = []
+  source = argv if argv else sys.stdin
+  directories = [x.strip() for x in source]
+  for directory in directories:
+    tmp = os.path.join(directory, 'cvs2svn-tmp')
+    commits = os.path.join(tmp, 'git-dump.dat')
+    if not os.path.exists(commits):
+      sys.stderr.write("skipping %s; no commit data\n" % directory)
+      continue
+    blob_index = deserialize_blob_map(open(os.path.join(tmp, 'git-blob.idx')))
+    records.extend(deserialize_records(open(commits, 'r'), blob_index))
+  records.sort(key=operator.attrgetter('timestamp'))
+  #records = list(deserialize_records(source))
   serialize_records(records, sys.stdout)
   return 0
 
 if __name__ == '__main__':
   sys.exit(main(sys.argv[1:]))
--
cgit v1.2.3-65-gdbad
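
The create-git.sh hunk above relies on git's alternates mechanism: objects/info/alternates lists one object directory per line, and git resolves objects out of those pools as if they were local, which is why fast-import can reference the per-module blob sha1s without copying any blob data. The later 'git repack -Adf' then copies only the reachable objects into the combined repo, after which the alternates file can be safely removed. A minimal Python sketch of the same wiring; the helper name and example paths are hypothetical, not from the patch:

    # Sketch only: mirrors `echo "$l/git/objects" >> objects/info/alternates`
    # from create-git.sh.  Helper name and example paths are illustrative.
    import os

    def add_alternates(combined_git_dir, donor_repo_dirs):
        info_dir = os.path.join(combined_git_dir, 'objects', 'info')
        if not os.path.isdir(info_dir):
            os.makedirs(info_dir)
        handle = open(os.path.join(info_dir, 'alternates'), 'a')
        for donor in donor_repo_dirs:
            # One object-directory path per line.
            handle.write(os.path.join(donor, 'git', 'objects') + '\n')
        handle.close()

    # e.g.: add_alternates('.', ['/tmp/final/app-editors', '/tmp/final/dev-cpp'])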
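
process_directory.sh now imports only blobs into each per-module repo and saves the mark-to-sha1 table via --export-marks; rewrite-commit-dump.py later uses that table (git-blob.idx) to rewrite filemodify lines of the form 'M 100644 :17 some/file' into raw sha1s that resolve through the alternates. Each line of an --export-marks file has the form ':<idnum> <sha1>'. A self-contained sketch of just that resolution step, assuming the same cvs2svn-tmp file layout; unlike the full parser in the patch, it deliberately ignores the framing of 'data' blocks:

    # Sketch only: resolve ':mark' blob references in a cvs2svn commit dump
    # against a `git fast-import --export-marks` file.
    import sys

    def load_marks(path):
        # Marks file lines look like ':1 e8b9ed651c6209820779382edee2537209aba4ae'.
        marks = {}
        for line in open(path):
            mark, sha1 = line.split()
            marks[int(mark.lstrip(':'))] = sha1
        return marks

    def resolve_blob_refs(dump, marks, out=sys.stdout):
        # Rewrites 'M 100644 :17 some/file' -> 'M 100644 <sha1> some/file'.
        # Naive: assumes no line inside a 'data' payload looks like a
        # filemodify command; the patch's parser tracks data blocks properly.
        for line in dump:
            chunks = line.split(None, 3)
            if len(chunks) == 4 and chunks[0] == 'M' and chunks[2].startswith(':'):
                line = ' '.join(chunks[:2] + [marks[int(chunks[2][1:])], chunks[3]])
            out.write(line)

    if __name__ == '__main__':
        resolve_blob_refs(open('cvs2svn-tmp/git-dump.dat'),
                          load_marks('cvs2svn-tmp/git-blob.idx'))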
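
For reference, what serialize_records() emits is plain 'git fast-import' input: a single 'reset', then the timestamp-sorted commits on one branch, each carrying a sequential mark. A sketch that emits a one-commit stream of that shape; the author, timestamp, and message are illustrative, and the sha1 is borrowed from the comment in the diff above:

    # Sketch only: emits a single-commit stream shaped like the rewritten
    # output of serialize_records().  All field values are illustrative.
    import sys

    msg = "dev-cpp/gtkmm: version bump\n\nPackage-Manager: portage-2.1\n"
    write = sys.stdout.write
    write('reset refs/heads/master\n')
    write('commit refs/heads/master\n')
    write('mark :1\n')
    write('author Larry The Cow <larry@gentoo.org> 1350370867 +0000\n')
    write('committer Larry The Cow <larry@gentoo.org> 1350370867 +0000\n')
    write('data %i\n%s' % (len(msg), msg))
    write('M 100644 e8b9ed651c6209820779382edee2537209aba4ae dev-cpp/gtkmm/ChangeLog\n')
    write('\n')

Piping that into 'git fast-import' inside an empty bare repo produces a one-commit master branch; the real stream is the same pattern repeated once per record, in timestamp order, which is what lets this patch skip per-repo commit creation and git's own linearization entirely.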