#!/usr/bin/python
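"""Merge per-directory cvs2git fast-import dumps into a single stream.

Reads directory names from argv or stdin, pulls each directory's
cvs2svn-tmp/git-dump.dat, normalizes Package-Manager footers in the commit
messages, re-merges commits that the per-directory conversion split apart,
and writes one combined fast-import stream to stdout.
"""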
import functools
import itertools
import operator
import os
import re
import sys
from collections import namedtuple

mangler = []
mangler.append(functools.partial(
  re.compile(r"^\(paludis (0.1.*)\)$", re.M|re.I).sub,
    r"Package-Manager: paludis-\1/"))
mangler.append(functools.partial(
  re.compile(r'^\(portage version: *([^,\n)]*), +unsigned Manifest commit\)$', re.M|re.I).sub,
    r'Package-Manager: portage-\1'))
mangler.append(functools.partial(
  re.compile(r"^\(portage version: (.*)\)$", re.M|re.I).sub,
    r"Package-Manager: portage-\1"))

fields = ('author', 'committer', 'msg', 'files', 'timestamp')
fields_map = dict((attr, idx) for idx, attr in enumerate(fields))
class record(namedtuple('record', fields)):
  def safe_combine(self, other, file_idx=fields_map['files']):
    """Return a copy of this record with other's file ops merged in.

    The two records must not touch any path in common; an overlap trips the
    assert rather than silently clobbering one side's ops.
    """
    files = self.files.copy()
    assert not set(files).intersection(other.files), (files, other.files)
    files.update(other.files)
    items = list(self)
    items[file_idx] = files
    return self.__class__(*items)

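# The commit stream is git-fast-import(1) format as written by cvs2git; an
# illustrative, hand-made commit block looks like:
#
#   commit refs/heads/master
#   mark :1
#   committer Some Dev <dev@gentoo.org> 1100000000 +0000
#   data 16
#   Initial import.
#   M 100644 :5 dev-cpp/gtkmm/ChangeLog
#   D dev-cpp/gtkmm/gtkmm-2.2.ebuild
#   <blank line terminates the commit>
#
# File ops may name a blob by sha1 or by :mark; marks are resolved through
# blob_idx (see deserialize_blob_map below).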
def deserialize_records(source, blob_idx):
  line = source.readline()
  while line:
    # Skip reset/progress noise; the [0:1] slice keeps blank lines and EOF
    # from raising IndexError.
    while line.split()[0:1] in (['reset'], ['progress']):
      line = source.readline()

    # First get the free form fields; stop after we get the commit msg.
    assert line.split()[0] == 'commit', line
    d = {}
    while True:
      line = source.readline()
      chunks = line.split(None, 1)
      assert len(chunks) == 2, line
      if chunks[0] in ('from', 'mark'):
        continue
      assert chunks[0] in ('author', 'committer', 'data')
      if chunks[0] != 'data':
        d[chunks[0]] = chunks[1].strip()
        continue
      # Process the commit message...
      size = int(chunks[1])
      data = source.read(size)
      assert len(data) == size, (line, data)
      for func in mangler:
        data = func(data)
      d['msg'] = data
      line = source.readline()
      # Note that cvs2git writes slightly funky data statements; the byte count
      # doesn't necessarily include the trailing newline.
      if line == '\n':
        line = source.readline()
      break

    assert line
    # From can show up here on occasion... annoying.
    if line.split()[0:1] == ['from']:
      line = source.readline()
    files = {}
    while line != '\n':
      # Only two file op types show up in these dumps: M (modify) and D (delete).
      assert line[-1] == '\n'
      line = line[:-1]
      mode = line.split(None, 1)
      assert len(mode) == 2, line
      if mode[0] == 'D':
        files[mode[1]] = (mode[0], line)
      elif mode[0] == 'M':
        # M 100644 e8b9ed651c6209820779382edee2537209aba4ae dev-cpp/gtkmm/ChangeLog
        # If the blob field isn't a sha1 but starts with ':', it's a
        # fast-import mark; resolve it via the blob index.  maxsplit=3 keeps
        # a filename containing spaces in a single chunk.
        chunks = line.split(None, 3)
        assert len(chunks) == 4, line
        fname = chunks[3]
        if chunks[2][0] == ':':
          line = ' '.join(chunks[:2] + [blob_idx[int(chunks[2][1:])], fname])
        files[fname] = (mode[0], line)
      else:
        raise AssertionError("got unknown file op: mode=%r, line:\n%r" % (mode[0], line))
      line = source.readline()
    d['files'] = files
    # Basic sanity check for the code above...
    assert set(fields).issuperset(d), d
    d.setdefault('author', d.get('committer'))
    assert d['author'] is not None
    # Skank the timestamp out...
    chunks = d['author'].rsplit(None, 1)
    assert len(chunks) == 2 and chunks[1] == '+0000', d['author']
    d['timestamp'] = long(chunks[0].rsplit(None, 1)[1])
    yield record(*[d.get(x) for x in fields])
    # Bleh... of course namedtuple doesn't make this easy.
    line = source.readline()

def serialize_records(records, handle, target='refs/heads/master', progress=5000):
  write = handle.write
  write('reset %s\n' % target)
  total = len(records)
  for idx, rec in enumerate(records, 1):
    if idx % progress == 0:
      write('progress %02.1f%%: %i of %i commits\n'
        % ((100 * float(idx))/total, idx, total))
    write('commit %s\n' % target)
    write('mark :%i\n' % idx)
    # fields = ('author', 'committer', 'msg', 'files', 'timestamp'); the mark
    # was already written above, and the timestamp is skipped since it's
    # embedded in the author/committer values.
    for name, value in zip(fields, rec):
      if name == 'files':
        for filename in sorted(value):
          write("%s\n" % (value[filename][1],))
      elif name in ('author', 'committer'):
        write("%s %s\n" % (name, value))
      elif name == 'msg':
        write("data %i\n%s" % (len(value), value))
      elif name == 'timestamp':
        continue
      else:
        raise AssertionError("serialize is out of sync; don't know field %s" % name)
    write("\n")

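# git-blob.idx maps fast-import marks to blob sha1s, one pair per line; an
# illustrative entry: ":5 e8b9ed651c6209820779382edee2537209aba4ae".  The
# leading ':' is optional as far as this parser is concerned.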
def deserialize_blob_map(source):
  source = (x.strip().split() for x in source)
  return dict((int(x[0].lstrip(':')), x[1]) for x in source)

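# Each directory is converted by cvs2git on its own (see main), so one CVS
# commit spanning several directories appears once per dump.  Those copies
# share timestamp, author and message, which is the identity used below.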
def simple_dedup(records):
  # Dedup via the (timestamp, author, msg) triple.
  dupes = {}
  for idx, rec in enumerate(records):
    dupes.setdefault((rec.timestamp, rec.author, rec.msg), []).append((idx, rec))
  for value in dupes.itervalues():
    if len(value) == 1:
      continue
    # Merge the group, earliest copy first, and collapse it onto the
    # position of its first occurrence.
    value.sort(key=operator.itemgetter(0))
    combined = value[0][1]
    for _, item in value[1:]:
      combined = combined.safe_combine(item)
    value[:] = [(value[0][0], combined)]
  l = itertools.imap(operator.itemgetter(0), dupes.itervalues())
  return itertools.imap(operator.itemgetter(1), sorted(l, key=operator.itemgetter(0)))

def main(argv):
  records = []
  # Directories to merge come from argv, or from stdin one per line.
  source = argv if argv else sys.stdin
  directories = [x.strip() for x in source]
  for directory in directories:
    tmp = os.path.join(directory, 'cvs2svn-tmp')
    commits = os.path.join(tmp, 'git-dump.dat')
    if not os.path.exists(commits):
      sys.stderr.write("skipping %s; no commit data\n" % directory)
      continue
    records.extend(
      deserialize_records(
        open(commits, 'r'),
        deserialize_blob_map(open(os.path.join(tmp, 'git-blob.idx')))
      )
    )
  sorter = operator.attrgetter('timestamp')
  # Get everything into timestamp order first.  This leans on Python's stable
  # sort: commits from the same repo that share a timestamp keep their
  # original relative order (the chunk just moves as a unit), so the
  # histories can be interleaved without reordering any single repo.
  records.sort(key=sorter)
  records[:] = simple_dedup(records)
  serialize_records(records, sys.stdout)
  return 0

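# Typical pipeline (script name and paths illustrative), run from inside the
# target git repository:
#   ls -d /var/tmp/cvs2svn/* | ./combine-dumps.py | git fast-import
# The converted directories can also be passed as arguments instead.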
if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))