Transcode non-UTF-8 log messages
[rcs-fast-export] / rcs-fast-export.rb
1 #!/usr/bin/ruby
2
3 =begin
4 RCS fast export: run the script with the `--help` option for further
5 information.
6
7 No installation needed: you can run it from anywhere, including the git
8 checkout directory. For extra comfort, symlink it to some directory in
9 your PATH. I myself have this symlink:
10
11         ~/bin/rcs-fast-export -> ~/src/rcs-fast-export/rcs-fast-export.rb
12
13 allowing me to run `rcs-fast-export` from anywhere.
14 =end
15
16 =begin
17 TODO
18         * Refactor commit coalescing
19         * Add --strict-symbol-check to only coalesce commits if their symbol lists are equal
20         * Add support for commitid for coalescing commits
21         * Further coalescing options? (e.g. small logfile differences)
22         * Proper branching support in multi-file export
23         * Optimize memory usage by discarding unneeded text
24         * Provide an option that marks a file as deleted based on symbolic revisions
25 =end
26
27 require 'pp'
28 require 'set'
29
# Error signalling missing branch support. Being a NotImplementedError (a
# ScriptError rather than a StandardError), it is not caught by a bare `rescue`.
class NoBranchSupport < NotImplementedError
end
31
# Backport of Integer#odd? (introduced in Ruby 1.8.7) for older interpreters;
# nothing is redefined when integers already respond to it.
unless 2.respond_to?(:odd?)
        Integer.class_eval do
                # true when the remainder of the division by two is one
                def odd?
                        self % 2 == 1
                end
        end
end
41
# Put standard output in binary mode: git fast-import doesn't like Windows
# line endings, and binary mode ensures that the line terminator is a single
# 0x0A on Windows too (it would be expanded to 0x0D 0x0A otherwise).
STDOUT.binmode
46
=begin
RCS fast-export version: set to `git` in the repository, but can be overridden
by packagers, e.g. based on the latest tag, git description, custom packager
patches or whatever.

When the version is set to `git`, the `version` method makes a little effort to
find more information about which commit we are at.
=end

RFE_VERSION="git"
57
# Print the program name and version on STDERR.
#
# When RFE_VERSION is the placeholder "git", the directory holding the script
# (following one level of symlink, if the script is a symlink) is inspected: if
# it contains a `.git` entry, the abbreviated hash, the name reported by
# `git name-rev` (with a trailing `*` when `git diff` reports changes) and the
# commit date are appended. Any failure while querying git degrades the extra
# information to " (no info)".
#
# The working directory of the process is left untouched.
def version
        if RFE_VERSION == "git"
                nolinkfile = File.readlink(__FILE__) rescue __FILE__
                script_dir = File.expand_path(File.dirname(nolinkfile))

                # stays nil (and interpolates as an empty string) without a .git entry
                info = nil

                # the block form of chdir restores the previous directory on exit,
                # so relative paths used by the caller keep their meaning
                Dir.chdir(script_dir) do
                        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere
                        if File.exist?('.git')
                                begin
                                        git_out = `git log -1 --pretty="%h %H%n%ai" | git name-rev --stdin`.split("\n")
                                        hash = git_out.first.split.first
                                        branch = git_out.first.split('(').last.chomp(')')
                                        date = git_out.last.split.first
                                        # only the exit status matters: non-zero means local changes
                                        `git diff --no-ext-diff --quiet --exit-code`
                                        branch << "*" unless $?.success?
                                        info = " [#{branch}] #{hash} (#{date})"
                                rescue
                                        info = " (no info)"
                                end
                        end
                end

                STDERR.puts "#{$0}: RCS fast-export, #{RFE_VERSION} version#{info}"
        else
                STDERR.puts "#{$0}: RCS fast-export, version #{RFE_VERSION}"
        end
end
80
# Print the help text (synopsis, description, options and the matching git
# configuration keys) on STDERR. Standard output is flushed first so that
# anything already written there is pushed out before the help text.
def usage
        $stdout.flush
        STDERR.puts <<EOM
#{$0} [options] file [file ...]

Fast-export the RCS history of one or more files. If a directory is specified,
all RCS-tracked files in the directory and its descendants are exported.

When importing single files, their pathname is discarded during import. When
importing directories, only the specified directory component is discarded.

When importing a single file, RCS commits are converted one by one. Otherwise,
some heuristics are used to determine how to coalesce commits touching different
files.

Currently, commits are coalesced if they share the exact same log and if their
date differs by no more than the user-specified fuzziness. Additionally, the
symbols in one of the commits must be a subset of the symbols in the other
commit, unless --no-symbol-check is specified or rcs.symbolCheck is set to
false in the git configuration.

Typical usage:
    git init && rcs-fast-export.rb . | git fast-import && git reset

Options:
        --help, -h, -?          display this help text
        --authors-file, -A      specify a file containing username = Full Name <email> mappings
        --[no-]author-is-committer
                                use the author name and date as committer identity
        --ignore                ignore the specified files (shell pattern)
        --log-encoding          specify the encoding of log messages, for transcoding to UTF-8
        --rcs-commit-fuzz       fuzziness in RCS commits to be considered a single one when
                                importing multiple files
                                (in seconds, defaults to 300, i.e. 5 minutes)
        --[no-]warn-missing-authors
                                [do not] warn about usernames missing from the map file
        --[no-]symbol-check     [do not] check symbols when coalescing commits
        --[no-]tag-each-rev     [do not] create a lightweight tag for each RCS revision when
                                importing a single file
        --[no-]log-filename     [do not] prepend the filename to the commit log when importing
                                a single file
        --skip-branches         when exporting multiple files with a branched history, export
                                the main branch only instead of aborting due to the lack of
                                support for branched multi-file history export



Config options:
        rcs.authorsFile         for --authors-file
        rcs.authorIsCommitter   for --author-is-committer
        rcs.tagEachRev          for --tag-each-rev
        rcs.logFilename         for --log-filename
        rcs.commitFuzz          for --rcs-commit-fuzz
        rcs.warnMissingAuthors  for --warn-missing-authors
        rcs.symbolCheck         for --symbol-check
        rcs.tagFuzz             for --rcs-tag-fuzz

EOM
end
140
# Write +msg+ on STDERR, flushing standard output beforehand so that what was
# already written there is pushed out before the message.
def warning(msg)
        $stdout.flush
        STDERR.puts(msg)
end
145
# Report through `warning` that +arg+ could not be found.
def not_found(arg)
        warning(format('Could not find %s', arg))
end
149
# Print the `committer` line of a commit on standard output.
#
# With a truthy opts[:author_is_committer] the identity is the given author
# and date; otherwise it is whatever `git var GIT_COMMITTER_IDENT` prints.
def emit_committer(opts, author, date)
        identity = opts[:author_is_committer] ? "#{author} #{date}" : `git var GIT_COMMITTER_IDENT`.chomp
        puts "committer #{identity}"
end
158
# Load the `username = Full Name <email>` mappings found in the file +fn+.
#
# Returns a hash that maps usernames to author names & emails; both sides are
# stripped of surrounding whitespace. Blank lines are ignored, and a line
# without `=` is reported and skipped instead of aborting the whole load. A
# username defined more than once is reported and the last definition wins.
#
# Loading is best-effort: if the file cannot be opened or read, the problem is
# reported through not_found and whatever was collected so far is returned.
def load_authors_file(fn)
        hash = {}
        begin
                File.open(File.expand_path(fn)) do |io|
                        io.each_line do |line|
                                next if line.strip.empty?

                                uname, author = line.split('=', 2)
                                # no `=` on the line: there is no author part to strip
                                if author.nil?
                                        warning "Skipping malformed line #{io.lineno} of #{fn}: #{line.chomp}"
                                        next
                                end

                                # split returns fresh strings, so stripping in place is safe
                                uname.strip!
                                author.strip!
                                warning "Username #{uname} redefined to #{author}" if hash.has_key? uname
                                hash[uname] = author
                        end
                end
        rescue
                not_found(fn)
        end
        return hash
end
177
# Look up the author identity for the username +name+ in opts[:authors].
#
# Raises a RuntimeError when opts[:authors] is not a Hash. An unknown name gets
# the default identity "<name> <empty>", which is stored in the map so that the
# optional warning (opts[:warn_missing_authors]) is given once per name.
def username_to_author(name, opts)
        authors = opts[:authors]
        raise "no authors map defined" unless authors.is_a?(Hash)

        return authors[name] if authors.key?(name)

        warning "no author found for #{name}" if opts[:warn_missing_authors]
        authors[name] = "#{name} <empty>"
end
189
# Display a message about a (recoverable) error on STDERR: the error itself on
# a first line tagged `ERROR:`, the action taken on a second, tab-indented line.
def alert(msg, action)
        STDERR.puts "ERROR:\t#{msg}", "\t#{action}"
end
195
class Time
        # Build a UTC Time from a dot-separated RCS date string with six fields
        # (passed to Time.utc in order). A first field shorter than three
        # characters is taken as a year of the 1900s.
        #
        # Raises ArgumentError unless there are exactly six fields.
        def self.rcs(string)
                year, *rest = string.split('.')
                raise ArgumentError, "wrong number of fields for RCS date #{string}" unless rest.length == 5
                # in Ruby 1.9, '99' is interpreted as year 99, not year 1999
                year = "19#{year}" if year.length < 3
                Time.utc(year, *rest)
        end
end
207
208 module RCS
209         # strip an optional final ;
210         def RCS.clean(arg)
211                 arg.chomp(';')
212         end
213
214         # strip the first and last @, and de-double @@s
215         def RCS.sanitize(arg)
216                 case arg
217                 when Array
218                         ret = arg.dup
219                         raise 'malformed first line' unless ret.first[0,1] == '@'
220                         raise 'malformed last line' unless ret.last[-1,1] == '@'
221                         ret.first.sub!(/^@/,'')
222                         ret.last.sub!(/@$/,'')
223                         ret.map { |l| l.gsub('@@','@') }
224                 when String
225                         arg.chomp('@').sub(/^@/,'').gsub('@@','@')
226                 else
227                         raise
228                 end
229         end
230
231         # clean and sanitize
232         def RCS.at_clean(arg)
233                 RCS.sanitize RCS.clean(arg)
234         end
235
236         def RCS.mark(key)
237                 @@marks ||= {}
238                 if @@marks.key? key
239                         @@marks[key]
240                 else
241                         @@marks[key] = @@marks.length + 1
242                 end
243         end
244
245         def RCS.blob(file, rev)
246                 RCS.mark([file, rev])
247         end
248
249         def RCS.commit(commit)
250                 RCS.mark(commit)
251         end
252
253         class File
254                 attr_accessor :head, :comment, :desc, :revision, :fname, :mode
255                 def initialize(fname, executable)
256                         @fname = fname.dup
257                         @head = nil
258                         @comment = nil
259                         @desc = []
260                         @revision = Hash.new { |h, r| h[r] = Revision.new(self, r) }
261                         @mode = executable ? '755' : '644'
262                 end
263
264                 def has_revision?(rev)
265                         @revision.has_key?(rev) and not @revision[rev].author.nil?
266                 end
267
268                 def export_commits(opts={})
269                         counter = 0
270                         exported = []
271                         log_enc = opts[:log_encoding]
272                         until @revision.empty?
273                                 counter += 1
274
275                                 # a string sort is a very good candidate for
276                                 # export order, getting a miss only for
277                                 # multi-digit revision components
278                                 keys = @revision.keys.sort
279
280                                 warning "commit export loop ##{counter}"
281                                 warning "\t#{exported.length} commits exported so far: #{exported.join(', ')}" unless exported.empty?
282                                 warning "\t#{keys.size} to export: #{keys.join(', ')}"
283
284                                 keys.each do |key|
285                                         rev = @revision[key]
286                                         # the parent commit is rev.next if we're on the
287                                         # master branch (rev.branch is nil) or
288                                         # rev.diff_base otherwise
289                                         from = rev.branch.nil? ? rev.next : rev.diff_base
290                                         # A commit can only be exported if it has no
291                                         # parent, or if the parent has been exported
292                                         # already. Skip this commit otherwise
293                                         if from and not exported.include? from
294                                                 next
295                                         end
296
297                                         branch = rev.branch || 'master'
298                                         author = username_to_author(rev.author, opts)
299                                         date = "#{rev.date.tv_sec} +0000"
300                                         log = String.new
301                                         if opts[:log_filename]
302                                                 log << @fname << ": "
303                                         end
304                                         if log_enc
305                                                 # git fast-import expects logs to be in UTF-8, so if a different log encoding
306                                                 # is specified for the log we transcode from whatever was specified to UTF-8.
307                                                 # we then mark the string as ASCII-8BIT (as everything else) so that string
308                                                 # lengths are computed in bytes
309                                                 log << rev.log.join.encode('UTF-8', log_enc).force_encoding('ASCII-8BIT')
310                                         else
311                                                 log << rev.log.join
312                                         end
313
314                                         puts "commit refs/heads/#{branch}"
315                                         puts "mark :#{RCS.commit key}"
316                                         puts "author #{author} #{date}"
317                                         emit_committer(opts, author, date)
318                                         puts "data #{log.length}"
319                                         puts log unless log.empty?
320                                         puts "from :#{RCS.commit from}" if from
321                                         puts "M #{@mode} :#{RCS.blob @fname, key} #{@fname}"
322
323                                         # TODO FIXME this *should* be safe, in
324                                         # that it should not unduly move
325                                         # branches back in time, but I'm not
326                                         # 100% sure ...
327                                         rev.branches.each do |sym|
328                                                 puts "reset refs/heads/#{sym}"
329                                                 puts "from :#{RCS.commit key}"
330                                         end
331                                         rev.symbols.each do |sym|
332                                                 puts "reset refs/tags/#{sym}"
333                                                 puts "from :#{RCS.commit key}"
334                                         end
335                                         if opts[:tag_each_rev]
336                                                 puts "reset refs/tags/#{key}"
337                                                 puts "from :#{RCS.commit key}"
338                                         end
339
340                                         exported.push key
341                                 end
342                                 exported.each { |k| @revision.delete(k) }
343                         end
344                 end
345         end
346
347         class Revision
348                 attr_accessor :rev, :author, :state, :next
349                 attr_accessor :branches, :log, :text, :symbols
350                 attr_accessor :branch, :diff_base, :branch_point
351                 attr_reader   :date
352                 def initialize(file, rev)
353                         @file = file
354                         @rev = rev
355                         @author = nil
356                         @date = nil
357                         @state = nil
358                         @next = nil
359                         @branches = Set.new
360                         @branch = nil
361                         @branch_point = nil
362                         @diff_base = nil
363                         @log = []
364                         @text = []
365                         @symbols = Set.new
366                 end
367
368                 def date=(str)
369                         @date = Time.rcs(str)
370                 end
371
372                 def blob
373                         str = @text.join('')
374                         ret = "blob\nmark :#{RCS.blob @file.fname, @rev}\ndata #{str.length}\n#{str}\n"
375                         ret
376                 end
377         end
378
379         # TODO: what if a revision does not end with newline?
380         # TODO this should be done internally, not piping out to RCS
381         def RCS.expand_keywords(rcsfile, revision)
382                 ret = ::File.read("|co -q -p#{revision} #{rcsfile}")
383                 lines = []
384                 ret.each_line do |line|
385                         lines << line
386                 end
387                 lines
388         end
389
390         def RCS.parse(fname, rcsfile, opts={})
391                 rcs = RCS::File.new(fname, ::File.executable?(rcsfile))
392
393                 ::File.open(rcsfile, 'r:ASCII-8BIT') do |file|
394                         status = [:basic]
395                         rev = nil
396                         lines = []
397                         difflines = []
398                         file.each_line do |line|
399                                 case status.last
400                                 when :basic
401                                         command, args = line.split($;,2)
402                                         next if command.empty?
403
404                                         if command.chomp!(';')
405                                                 warning "Skipping empty command #{command.inspect}" if $DEBUG
406                                                 next
407                                         end
408
409                                         case command
410                                         when 'head'
411                                                 rcs.head = RCS.clean(args.chomp)
412                                         when 'symbols'
413                                                 status.push :symbols
414                                                 next if args.empty?
415                                                 line = args; redo
416                                         when 'comment'
417                                                 rcs.comment = RCS.at_clean(args.chomp)
418                                         when /^[0-9.]+$/
419                                                 rev = command.dup
420                                                 if rcs.has_revision?(rev)
421                                                         status.push :revision_data
422                                                 else
423                                                         status.push :new_revision
424                                                 end
425                                         when 'desc'
426                                                 status.push :desc
427                                                 lines.clear
428                                                 status.push :read_lines
429                                         when 'branch', 'access', 'locks', 'expand'
430                                                 warning "Skipping unhandled command #{command.inspect}" if $DEBUG
431                                                 status.push :skipping_lines
432                                                 next if args.empty?
433                                                 line = args; redo
434                                         else
435                                                 raise "Unknown command #{command.inspect}"
436                                         end
437                                 when :skipping_lines
438                                         status.pop if line.strip.chomp!(';')
439                                 when :symbols
440                                         # we can have multiple symbols per line
441                                         pairs = line.strip.split($;)
442                                         pairs.each do |pair|
443                                                 sym, rev = pair.strip.split(':',2);
444                                                 if rev
445                                                         status.pop if rev.chomp!(';')
446                                                         rcs.revision[rev].symbols << sym
447                                                 else
448                                                         status.pop
449                                                 end
450                                         end
451                                 when :desc
452                                         rcs.desc.replace lines.dup
453                                         status.pop
454                                 when :read_lines
455                                         # we sanitize lines as we read them
456
457                                         actual_line = line.dup
458
459                                         # the first line must begin with a @, which we strip
460                                         if lines.empty?
461                                                 ats = line.match(/^@+/)
462                                                 raise 'malformed line' unless ats
463                                                 actual_line.replace line.sub(/^@/,'')
464                                         end
465
466                                         # if the line ends with an ODD number of @, it's the
467                                         # last line -- we work on actual_line so that content
468                                         # such as @\n or @ work correctly (they would be
469                                         # encoded respectively as ['@@@\n','@\n'] and
470                                         # ['@@@@\n']
471                                         ats = actual_line.chomp.match(/@+$/)
472                                         if nomore = (ats && Regexp.last_match(0).length.odd?)
473                                                 actual_line.replace actual_line.chomp.sub(/@$/,'')
474                                         end
475                                         lines << actual_line.gsub('@@','@')
476                                         if nomore
477                                                 status.pop
478                                                 redo
479                                         end
480                                 when :new_revision
481                                         case line.chomp
482                                         when /^date\s+(\S+);\s+author\s+(\S+);\s+state\s+(\S+);$/
483                                                 rcs.revision[rev].date = $1
484                                                 rcs.revision[rev].author = $2
485                                                 rcs.revision[rev].state = $3
486                                         when /^branches\s*;/
487                                                 next
488                                         when /^branches(?:\s+|$)/
489                                                 status.push :branches
490                                                 if line.index(';')
491                                                         line = line.sub(/^branches\s+/,'')
492                                                         redo
493                                                 end
494                                         when /^next\s+(\S+)?;$/
495                                                 nxt = rcs.revision[rev].next = $1
496                                                 next unless nxt
497                                                 raise "multiple diff_bases for #{nxt}" unless rcs.revision[nxt].diff_base.nil?
498                                                 rcs.revision[nxt].diff_base = rev
499                                                 rcs.revision[nxt].branch = rcs.revision[rev].branch
500                                         else
501                                                 status.pop
502                                         end
503                                 when :branches
504                                         candidate = line.split(';',2)
505                                         candidate.first.strip.split.each do |branch|
506                                                 raise "multiple diff_bases for #{branch}" unless rcs.revision[branch].diff_base.nil?
507                                                 rcs.revision[branch].diff_base = rev
508                                                 # we drop the last number from the branch name
509                                                 rcs.revision[branch].branch = branch.sub(/\.\d+$/,'.x')
510                                                 rcs.revision[branch].branch_point = rev
511                                         end
512                                         status.pop if candidate.length > 1
513                                 when :revision_data
514                                         case line.chomp
515                                         when 'log'
516                                                 status.push :log
517                                                 lines.clear
518                                                 status.push :read_lines
519                                         when 'text'
520                                                 if rev == rcs.head
521                                                         status.push :head
522                                                 else
523                                                         status.push :diff
524                                                 end
525                                                 lines.clear
526                                                 status.push :read_lines
527                                         else
528                                                 status.pop
529                                         end
530                                 when :log
531                                         rcs.revision[rev].log.replace lines.dup
532                                         status.pop
533                                 when :head
534                                         if opts[:expand_keywords]
535                                                 rcs.revision[rev].text.replace RCS.expand_keywords(rcsfile, rev)
536                                         else
537                                                 rcs.revision[rev].text.replace lines.dup
538                                         end
539                                         puts rcs.revision[rev].blob
540                                         status.pop
541                                 when :diff
542                                         if opts[:expand_keywords]
543                                                 rcs.revision[rev].text.replace RCS.expand_keywords(rcsfile, rev)
544                                         else
545                                                 difflines.replace lines.dup
546                                                 difflines.pop if difflines.last.empty?
547                                                 if difflines.first.chomp.empty?
548                                                         alert "malformed diff: empty initial line @ #{rcsfile}:#{file.lineno-difflines.length-1}", "skipping"
549                                                         difflines.shift
550                                                 end unless difflines.empty?
551                                                 base = rcs.revision[rev].diff_base
552                                                 unless rcs.revision[base].text
553                                                         pp rcs
554                                                         puts rev, base
555                                                         raise 'no diff base!'
556                                                 end
557                                                 # deep copy
558                                                 buffer = []
559                                                 rcs.revision[base].text.each { |l| buffer << [l.dup] }
560
561                                                 adding = false
562                                                 index = nil
563                                                 count = nil
564
565                                                 while l = difflines.shift
566                                                         if adding
567                                                                 raise 'negative index during insertion' if index < 0
568                                                                 raise 'negative count during insertion' if count < 0
569                                                                 adding << l
570                                                                 count -= 1
571                                                                 # collected all the lines, put the before
572                                                                 unless count > 0
573                                                                         unless buffer[index]
574                                                                                 buffer[index] = []
575                                                                         end
576                                                                         buffer[index].unshift(*adding)
577                                                                         adding = false
578                                                                 end
579                                                                 next
580                                                         end
581
582                                                         l.chomp!
583                                                         raise "malformed diff @ #{rcsfile}:#{file.lineno-difflines.length-1} `#{l}`" unless l =~ /^([ad])(\d+) (\d+)$/
584                                                         diff_cmd = $1.intern
585                                                         index = $2.to_i
586                                                         count = $3.to_i
587                                                         case diff_cmd
588                                                         when :d
589                                                                 # for deletion, index 1 is the first index, so the Ruby
590                                                                 # index is one less than the diff one
591                                                                 index -= 1
592                                                                 # we replace them with empty string so that 'a' commands
593                                                                 # referring to the same line work properly
594                                                                 while count > 0
595                                                                         buffer[index].clear
596                                                                         index += 1
597                                                                         count -= 1
598                                                                 end
599                                                         when :a
600                                                                 # addition will prepend the appropriate lines
601                                                                 # to the given index, and in this case Ruby
602                                                                 # and diff indices are the same
603                                                                 adding = []
604                                                         end
605                                                 end
606
607                                                 # turn the buffer into an array of lines, deleting the empty ones
608                                                 buffer.delete_if { |l| l.empty? }
609                                                 buffer.flatten!
610
611                                                 rcs.revision[rev].text = buffer
612                                         end
613                                         puts rcs.revision[rev].blob
614                                         status.pop
615                                 else
616                                         raise "Unknown status #{status.last}"
617                                 end
618                         end
619                 end
620
621                 # clean up the symbols/branches: look for revisions that have
622                 # one or more symbols but no dates, and make them into
623                 # branches, pointing to the highest commit with that key
624                 branches = []
625                 keys = rcs.revision.keys
626                 rcs.revision.each do |key, rev|
627                         if rev.date.nil? and not rev.symbols.empty?
628                                 top = keys.select { |k| k.match(/^#{key}\./) }.sort.last
629                                 tr = rcs.revision[top]
630                                 raise "unhandled complex branch structure met: #{rev.inspect} refers #{tr.inspect}" if tr.date.nil?
631                                 tr.branches |= rev.symbols
632                                 branches << key
633                         end
634                 end
635                 branches.each { |k| rcs.revision.delete k }
636
637                 return rcs
638         end
639
# The set of files belonging to a commit: a table from each RCS file
# object to the revision of that file carried by the commit.
class Tree
        # commit:: the commit owning this tree; it is told (through its
        #          warn_about method) about harmless re-additions
        def initialize(commit)
                @commit = commit
                @files = {}
        end

        # Absorb all the files of another tree. Additions are staged on a
        # copy of the file table, which is swapped in only when every one
        # of them went through, so a failing merge leaves the tree as it was.
        def merge!(tree)
                staged = @files.dup
                tree.each do |rcs, rev|
                        add(rcs, rev, staged)
                end
                @files.replace(staged)
        end

        # Register revision +rev+ of file +rcs+ in +file_list+ (this tree's
        # own table unless told otherwise). Adding a file that is already
        # present raises a RuntimeError when the text of the two revisions
        # differs, and only warns through the commit when it is the same.
        def add(rcs, rev, file_list=@files)
                if file_list.key?(rcs)
                        prev = file_list[rcs]
                        # the logs are only spelled out when they disagree
                        str = if prev.log == rev.log
                                "re-adding existing file #{rcs.fname} (old: #{prev.rev}, new: #{rev.rev})"
                        else
                                "re-adding existing file #{rcs.fname} (old: #{[prev.rev, prev.log.to_s].inspect}, new: #{[rev.rev, rev.log.to_s].inspect})"
                        end
                        raise str if prev.text != rev.text
                        @commit.warn_about str
                end
                file_list[rcs] = rev
        end

        # Iterate over the (rcs, revision) pairs of the tree
        def each(&block)
                @files.each(&block)
        end

        # One line per file: "D <name>" for revisions whose state is "dead"
        # (compared case-insensitively), otherwise "M <mode> :<blob> <name>"
        # where <blob> is whatever RCS.blob returns for that file and revision
        def to_a
                @files.map do |rcs, rev|
                        if rev.state.downcase == "dead"
                                "D #{rcs.fname}"
                        else
                                "M #{rcs.mode} :#{RCS.blob rcs.fname, rev.rev} #{rcs.fname}"
                        end
                end
        end

        # Names of all the files in the tree
        def filenames
                @files.keys.map { |rcs| rcs.fname }
        end

        # The lines of #to_a, newline-separated
        def to_s
                to_a.join("\n")
        end
end
695
# A commit of the multi-file export. It starts out as one revision of one
# RCS file and can absorb other commits through #merge!.
#
# date::     date of the revision the commit was created from
# min_date:: earliest date among the commits merged into this one
# max_date:: latest date among the commits merged into this one
# log::      copy of the revision log; #export joins it into one string
# symbols::  copy of the revision symbols, emitted as tags by #export
# author::   RCS username of the revision author
# branch::   branch of the revision (always nil for now, see #initialize)
# tree::     the Tree of files touched by the commit
class Commit
        attr_accessor :date, :log, :symbols, :author, :branch
        attr_accessor :tree
        attr_accessor :min_date, :max_date

        # Build the commit for revision +rev+ of the RCS file +rcs+.
        # Raises NoBranchSupport if the revision is on a branch.
        def initialize(rcs, rev)
                raise NoBranchSupport if rev.branch
                self.date = rev.date.dup
                self.min_date = self.max_date = self.date
                self.log = rev.log.dup
                self.symbols = rev.symbols.dup
                self.author = rev.author
                self.branch = rev.branch

                self.tree = Tree.new self
                self.tree.add rcs, rev
        end

        # Flat view of the commit, used for debugging dumps
        def to_a
                [self.min_date, self.date, self.max_date, self.branch, self.symbols, self.author, self.log, self.tree.to_a]
        end

        # Emit a warning, mentioning the date of the commit
        def warn_about(str)
                warn str + " for commit on #{self.date}"
        end

        # Sort by date and then by number of symbols
        def <=>(other)
                ds = self.date <=> other.date
                if ds != 0
                        return ds
                else
                        return self.symbols.length <=> other.symbols.length
                end
        end

        # Absorb another commit: merge its tree (which raises, leaving this
        # commit untouched, if the trees conflict), widen the date range and
        # add its symbols.
        def merge!(commit)
                self.tree.merge! commit.tree
                if commit.max_date > self.max_date
                        self.max_date = commit.max_date
                end
                if commit.min_date < self.min_date
                        self.min_date = commit.min_date
                end
                self.symbols.merge commit.symbols
        end

        # Write the commit to standard output as a git fast-import stream:
        # the commit itself on its branch ('master' when none is set),
        # followed by a tag reset for each symbol.
        #
        # opts:: the parsed options, handed over to username_to_author and
        #        emit_committer
        def export(opts={})
                xbranch = self.branch || 'master'
                xauthor = username_to_author(self.author, opts)
                xlog = self.log.join
                numdate = self.date.tv_sec
                xdate = "#{numdate} +0000"
                key = numdate.to_s

                # fast-import reads exactly the announced number of BYTES, so
                # the character count is wrong as soon as the log holds
                # multi-byte characters; String#length is only used on Rubies
                # that predate String#bytesize, where it counts bytes anyway
                log_size = xlog.respond_to?(:bytesize) ? xlog.bytesize : xlog.length

                puts "commit refs/heads/#{xbranch}"
                puts "mark :#{RCS.commit key}"
                puts "author #{xauthor} #{xdate}"
                emit_committer(opts, xauthor, xdate)
                puts "data #{log_size}"
                puts xlog unless xlog.empty?
                # TODO branching support for multi-file export
                # puts "from :#{RCS.commit from}" if self.branch_point
                puts self.tree.to_s

                # TODO branching support for multi-file export
                # rev.branches.each do |sym|
                #       puts "reset refs/heads/#{sym}"
                #       puts "from :#{RCS.commit key}"
                # end

                self.symbols.each do |sym|
                        puts "reset refs/tags/#{sym}"
                        puts "from :#{RCS.commit key}"
                end

        end
end
773 end
774
775 require 'getoptlong'
776
# Command-line options understood by the script. Every switch handled by
# the option-processing loop must be declared here, otherwise GetoptLong
# rejects it as an invalid option before the loop ever sees it.
opts = GetoptLong.new(
        # Authors file, like git-svn and git-cvsimport, more than one can be
        # specified
        ['--authors-file', '-A', GetoptLong::REQUIRED_ARGUMENT],
        # Use author identity as committer identity?
        ['--author-is-committer', GetoptLong::NO_ARGUMENT],
        ['--no-author-is-committer', GetoptLong::NO_ARGUMENT],
        # Use "co" to obtain the actual revision with keywords expanded.
        ['--expand-keywords', GetoptLong::NO_ARGUMENT],
        # RCS file suffix, like RCS
        ['--rcs-suffixes', '-x', GetoptLong::REQUIRED_ARGUMENT],
        # Shell pattern to identify files to be ignored
        ['--ignore', GetoptLong::REQUIRED_ARGUMENT],
        # Encoding of log messages in the RCS files
        ['--log-encoding', GetoptLong::REQUIRED_ARGUMENT],
        # Date fuzziness for commits to be considered the same (in seconds)
        ['--rcs-commit-fuzz', GetoptLong::REQUIRED_ARGUMENT],
        # Date fuzziness stored as the :tag_fuzz setting (in seconds); it is
        # raised to the commit fuzz when smaller
        ['--rcs-tag-fuzz', GetoptLong::REQUIRED_ARGUMENT],
        # warn about usernames missing in authors file map?
        ['--warn-missing-authors', GetoptLong::NO_ARGUMENT],
        ['--no-warn-missing-authors', GetoptLong::NO_ARGUMENT],
        # check symbols when coalescing?
        ['--symbol-check', GetoptLong::NO_ARGUMENT],
        ['--no-symbol-check', GetoptLong::NO_ARGUMENT],
        # tag each revision?
        ['--tag-each-rev', GetoptLong::NO_ARGUMENT],
        ['--no-tag-each-rev', GetoptLong::NO_ARGUMENT],
        # prepend filenames to commit logs?
        ['--log-filename', GetoptLong::NO_ARGUMENT],
        ['--no-log-filename', GetoptLong::NO_ARGUMENT],
        # skip branches when exporting a whole tree?
        ['--skip-branches', GetoptLong::NO_ARGUMENT],
        # show current version
        ['--version', '-v', GetoptLong::NO_ARGUMENT],
        # show help/usage
        ['--help', '-h', '-?', GetoptLong::NO_ARGUMENT]
)
813
# Options are returned in the order they appear on the command line, yet
# each of them applies to all the files passed.
# TODO maybe they should only apply to the following, unless there's only one
# file?
opts.ordering = GetoptLong::RETURN_IN_ORDER

# Non-option arguments collected while parsing the command line
file_list = []

# Settings driving the export, starting from the built-in defaults
parse_options = {
        :authors => {},
        :ignore => [],
        :commit_fuzz => 300,
        :tag_fuzz => -1,
}

# Seed the settings from the git configuration; the command line is
# processed afterwards and takes precedence.

# every configured authors file contributes to the authors map
`git config --get-all rcs.authorsfile`.each_line do |fn|
        parse_options[:authors].merge! load_authors_file(fn.chomp)
end

# on unless explicitly configured to false
parse_options[:author_is_committer] = `git config --bool rcs.authoriscommitter`.chomp != 'false'

# off unless explicitly configured to true
parse_options[:tag_each_rev] = `git config --bool rcs.tageachrev`.chomp == 'true'

# off unless explicitly configured to true
parse_options[:log_filename] = `git config --bool rcs.logfilename`.chomp == 'true'

# the fuzz defaults are only replaced when git reports a value
configured_fuzz = `git config --int rcs.commitFuzz`.chomp
parse_options[:commit_fuzz] = configured_fuzz.to_i if !configured_fuzz.empty?

configured_fuzz = `git config --int rcs.tagFuzz`.chomp
parse_options[:tag_fuzz] = configured_fuzz.to_i if !configured_fuzz.empty?

# on unless explicitly configured to false
parse_options[:symbol_check] = `git config --bool rcs.symbolcheck`.chomp != 'false'

# on unless explicitly configured to false
parse_options[:warn_missing_authors] = `git config --bool rcs.warnmissingauthors`.chomp != 'false'
857
# Apply the command-line options on top of the settings gathered so far.
# Non-option arguments are reported with an empty option name and are
# collected in file_list.
opts.each do |opt, arg|
        case opt
        when '--authors-file'
                authors = load_authors_file(arg)
                # warn about usernames this file defines again
                redef = parse_options[:authors].keys & authors.keys
                warning "Authors file #{arg} redefines #{redef.join(', ')}" unless redef.empty?
                parse_options[:authors].merge!(authors)
        when '--author-is-committer'
                parse_options[:author_is_committer] = true
        when '--no-author-is-committer'
                parse_options[:author_is_committer] = false
        when '--expand-keywords'
                parse_options[:expand_keywords] = true
        when '--rcs-suffixes'
                # TODO
        when '--ignore'
                parse_options[:ignore] << arg
        when '--log-encoding'
                # Encoding.find raises ArgumentError for unknown encoding names
                parse_options[:log_encoding] = Encoding.find(arg)
        when '--rcs-commit-fuzz'
                parse_options[:commit_fuzz] = arg.to_i
        when '--rcs-tag-fuzz'
                parse_options[:tag_fuzz] = arg.to_i
        when '--warn-missing-authors'
                # these two switches were declared but never acted upon, so
                # only the git configuration could change the setting
                parse_options[:warn_missing_authors] = true
        when '--no-warn-missing-authors'
                parse_options[:warn_missing_authors] = false
        when '--symbol-check'
                parse_options[:symbol_check] = true
        when '--no-symbol-check'
                parse_options[:symbol_check] = false
        when '--tag-each-rev'
                parse_options[:tag_each_rev] = true
        when '--no-tag-each-rev'
                # this is the default, which is fine since the missing key
                # (default) returns nil which is false in Ruby
                parse_options[:tag_each_rev] = false
        when '--log-filename'
                parse_options[:log_filename] = true
        when '--no-log-filename'
                # this is the default, which is fine since the missing key
                # (default) returns nil which is false in Ruby
                parse_options[:log_filename] = false
        when '--skip-branches'
                parse_options[:skip_branches] = true
        when ''
                file_list << arg
        when '--version'
                version
                exit
        when '--help'
                usage
                exit
        end
end

# the tag fuzz is never allowed to be smaller than the commit fuzz
if parse_options[:tag_fuzz] < parse_options[:commit_fuzz]
        parse_options[:tag_fuzz] = parse_options[:commit_fuzz]
end
913
914 require 'etc'
915
# Login name of the user running the script, if it can be determined
user = Etc.getlogin || ENV['USER']

# steal username/email data from other init files that may contain the
# information
#
# authors:: the username => identity Hash to fill in
# user::    the username the stolen identity is stored under
#
# Both are passed explicitly: a method body cannot see the local variables
# of the top-level script, so referring to them directly raised a NameError
# as soon as one of the files matched.
#
# The first readable file with a matching line wins; nothing is stored when
# none matches.
def steal_username(authors, user)
        [
                # the user's .hgrc file for a username field
                ['~/.hgrc',   /^\s*username\s*=\s*(["'])?(.*)\1$/,       2],
                # the user's .(g)vimrc for a changelog_username setting
                ['~/.vimrc',  /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
                ['~/.gvimrc', /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
        ].each do |fn, rx, idx|
                file = File.expand_path fn
                if File.readable?(file) and File.read(file) =~ rx
                        authors[user] = Regexp.last_match(idx).strip
                        break
                end
        end
end

# Unless the authors map already knows the current user, work out an
# identity for it: environment first, then the git configuration, then the
# passwd entry, and finally other tools' configuration files.
if user and not user.empty? and not parse_options[:authors].has_key?(user)
        # strings coming from ENV are frozen, so the fallbacks are assigned
        # rather than written into the string in place
        name = ENV['GIT_AUTHOR_NAME'].to_s
        name = `git config user.name`.chomp if name.empty?
        if name.empty?
                begin
                        entry = Etc.getpwnam(user)
                        # entry is nil where the passwd database is not
                        # available, and may lack the gecos field
                        name = entry.gecos.to_s if entry.respond_to?(:gecos)
                rescue ArgumentError
                        # no passwd entry for this login: leave the name empty
                end
        end

        if name.empty?
                # couldn't find a name, try to steal data from other sources
                steal_username(parse_options[:authors], user)
        else
                # if we found a name, try to find an email too
                email = ENV['GIT_AUTHOR_EMAIL'].to_s
                email = `git config user.email`.chomp if email.empty?

                if email.empty?
                        # couldn't find an email, try to steal data too
                        steal_username(parse_options[:authors], user)
                else
                        # we got both a name and email, fill the info
                        parse_options[:authors][user] = "#{name} <#{email}>"
                end
        end
end
958
# Nothing to export: show the usage and fail
if file_list.empty?
        usage
        exit 1
end

# Suffix of RCS files
SFX = ',v'

# Exit status of the script; a bit is set for every argument that could
# not be handled
status = 0

# Parsed RCS files, one entry per file found through the arguments.
# File.exist? is used throughout: the File.exists? alias is deprecated and
# no longer available in Ruby 3.2.
rcs = []
file_list.each do |arg|
        case ftype = File.ftype(arg)
        when 'file'
                rcsfile = nil
                if arg[-2,2] == SFX
                        # the argument is the RCS file itself
                        filename = File.basename(arg, SFX)
                        rcsfile = arg.dup if File.exist? arg
                        not_found "RCS file #{arg}" unless rcsfile
                else
                        # the argument is the working file: look for its RCS
                        # file in the RCS subdirectory first, next to it then
                        filename = File.basename(arg)
                        path = File.dirname(arg)
                        rcsfile = [
                                File.join(path, 'RCS', filename) + SFX,
                                File.join(path, filename) + SFX,
                        ].find { |candidate| File.exist? candidate }
                        not_found "RCS file for #{filename} in #{path}" unless rcsfile
                end
                unless rcsfile
                        # there is nothing to parse: record the failure and
                        # move on instead of handing a bogus path to RCS.parse
                        status |= 1
                        next
                end
                rcs << RCS.parse(filename, rcsfile, parse_options)
        when 'directory'
                argdirname = arg.chomp(File::SEPARATOR)
                pattern = File.join(argdirname, '**', '*' + SFX)
                Dir.glob(pattern, File::FNM_DOTMATCH).each do |rcsfile|
                        filename = File.basename(rcsfile, SFX)
                        path = File.dirname(rcsfile)
                        # strip trailing "/RCS" if present, or "RCS" if that's
                        # the full path
                        path.sub!(/(^|#{File::SEPARATOR})RCS$/, '')
                        # strip off the portion of the path specified
                        # on the command line from the front of the path
                        # (or delete the path completely if it is the same
                        # as the specified directory)
                        path.sub!(/^#{Regexp.escape argdirname}(#{File::SEPARATOR}|$)/, '')
                        filename = File.join(path, filename) unless path.empty?

                        # skip file if it's to be ignored
                        next if parse_options[:ignore].any? do |pat|
                                File.fnmatch?(pat, filename, File::FNM_PATHNAME)
                        end

                        # proceed
                        begin
                                rcs << RCS.parse(filename, rcsfile, parse_options)
                        rescue Exception
                                # Exception (rather than StandardError) so that
                                # errors such as NoBranchSupport, which derives
                                # from NotImplementedError, are reported too;
                                # the error is passed on untouched
                                warning "Failed to parse #{filename} @ #{rcsfile}:#{$.}"
                                raise
                        end
                end
        else
                warning "Cannot handle #{arg} of #{ftype} type"
                status |= 1
        end
end
1033
# A single RCS file is exported through its own export_commits. With more
# files, every revision first becomes a single-file commit; commits are then
# sorted by date, coalesced into multi-file commits and finally exported.
if rcs.length == 1
        rcs.first.export_commits(parse_options)
else
        warning "Preparing commits"

        commits = []

        # revisions on a branch make RCS::Commit.new raise NoBranchSupport,
        # which is fatal unless --skip-branches was given
        rcs.each do |r|
                r.revision.each do |k, rev|
                        begin
                                commits << RCS::Commit.new(r, rev)
                        rescue NoBranchSupport
                                if parse_options[:skip_branches]
                                        warning "Skipping revision #{rev.rev} for #{r.fname} (branch)"
                                else raise
                                end
                        end
                end
        end

        warning "Sorting by date"

        commits.sort!

        if $DEBUG
                warning "RAW commits (#{commits.length}):"
                commits.each do |c|
                        PP.pp c.to_a, $stderr
                end
        else
                warning "#{commits.length} single-file commits"
        end

        warning "Coalescing [1] by date with fuzz #{parse_options[:commit_fuzz]}"

        # Walk the commits from the last to the first; thisindex follows the
        # position of the current commit. Merged commits are deleted from the
        # array, but they always come after the current one, so the positions
        # still to be visited are not disturbed.
        thisindex = commits.size
        commits.reverse_each do |c|
                nextindex = thisindex
                thisindex -= 1

                # files of the current commit, and files touched by the
                # following commits we decided not to merge with
                cfiles = Set.new c.tree.filenames
                ofiles = Set.new

                mergeable = []

                # test for mergeable commits by looking at following commits
                while nextindex < commits.size
                        k = commits[nextindex]
                        nextindex += 1

                        # commits are date-sorted, so we know we can quit early if we are too far
                        # for coalescing to work
                        break if k.min_date > c.max_date + parse_options[:commit_fuzz]

                        skipthis = false

                        kfiles = Set.new k.tree.filenames

                        if c.log != k.log or c.author != k.author or c.branch != k.branch
                                skipthis = true
                        end

                        # the symbols agree when one set contains the other
                        unless c.symbols.subset?(k.symbols) or k.symbols.subset?(c.symbols)
                                cflist = cfiles.to_a.join(', ')
                                kflist = kfiles.to_a.join(', ')
                                if parse_options[:symbol_check]
                                        warning "Not coalescing #{c.log.inspect}\n\tfor (#{cflist})\n\tand (#{kflist})"
                                        warning "\tbecause their symbols disagree:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
                                        warning "\tretry with the --no-symbol-check option if you want to merge these commits anyway"
                                        skipthis = true
                                elsif $DEBUG
                                        warning "Coalescing #{c.log.inspect}\n\tfor (#{cflist})\n\tand (#{kflist})"
                                        warning "\twith disagreeing symbols:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
                                end
                        end

                        # keep track of filenames touched by commits we are not merging with,
                        # since we don't want to merge with commits that touch them, to preserve
                        # the monotonicity of history for each file
                        # TODO we could forward-merge with them, unless some of our files were
                        # touched too.
                        if skipthis
                                # if the candidate touches any file already in the commit,
                                # we can stop looking forward
                                break unless cfiles.intersection(kfiles).empty?
                                ofiles |= kfiles
                                next
                        end

                        # the candidate has the same log, author, branch and appropriate symbols
                        # does it touch anything in ofiles?
                        unless ofiles.intersection(kfiles).empty?
                                if $DEBUG
                                        cflist = cfiles.to_a.join(', ')
                                        kflist = kfiles.to_a.join(', ')
                                        oflist = ofiles.to_a.join(', ')
                                        warning "Not coalescing #{c.log.inspect}\n\tfor (#{cflist})\n\tand (#{kflist})"
                                        warning "\tbecause the latter intersects #{oflist} in #{(ofiles & kfiles).to_a.inspect}"
                                end
                                next
                        end

                        mergeable << k
                end

                # merge the candidates in order, giving up on the remaining
                # ones at the first failure
                mergeable.each do |k|
                        begin
                                c.merge! k
                        rescue RuntimeError => err
                                # k follows c in date order, so c.date - k.date is
                                # never positive: report the distance between the
                                # two dates, which is the fuzz to stay below
                                fuzz = (c.date - k.date).abs
                                warning "Fuzzy commit coalescing failed: #{err}"
                                warning "\tretry with commit fuzz < #{fuzz} if you don't want to see this message"
                                break
                        end
                        commits.delete k
                end
        end

        if $DEBUG
                warning "[1] commits (#{commits.length}):"
                commits.each do |c|
                        PP.pp c.to_a, $stderr
                end
        else
                warning "#{commits.length} coalesced commits"
        end

        commits.each { |c| c.export(parse_options) }

end

exit status