5 * Refactor commit coalescing
6 * Add --strict-symbol-check to only coalesce commits if their symbol lists are equal
7 * Add support for commitid for coalescing commits
8 * Further coalescing options? (e.g. small logfile differences)
9 * Proper branching support in multi-file export
10 * Optimize memory usage by discarding unneeded text
16 # Integer#odd? was introduced in Ruby 1.8.7, backport it to
18 unless 2.respond_to? :odd?
28 #{$0} [options] file [file ...]
30 Fast-export the RCS history of one or more files. If a directory is specified,
31 all RCS-tracked files in the directory and its descendants are exported.
33 When importing single files, their pathname is discarded during import. When
34 importing directories, only the specified directory component is discarded.
36 When importing a single file, RCS commits are converted one by one. Otherwise,
37 some heuristics are used to determine how to coalesce commits touching different
40 Currently, commits are coalesced if they share the exact same log and if their
41 date differs by no more than the user-specified fuzziness. Additionally, the
42 symbols in one of the commits must be a subset of the symbols in the other
43 commit, unless --no-symbol-check is specified or rcs.symbolCheck is set to
44 false in the git configuration.
47 git init && rcs-fast-export.rb . | git fast-import && git reset
50 --help, -h, -? display this help text
51 --authors-file, -A specify a file containing username = Full Name <email> mappings
52 --rcs-commit-fuzz fuzziness in RCS commits to be considered a single one when
53 importing multiple files
54 (in seconds, defaults to 300, i.e. 5 minutes)
55 --[no-]symbol-check [do not] check symbols when coalescing commits
56 --[no-]tag-each-rev [do not] create a lightweight tag for each RCS revision when
57 importing a single file
58 --[no-]log-filename [do not] prepend the filename to the commit log when importing
62 rcs.authorsFile for --authors-file
63 rcs.tagEachRev for --tag-each-rev
64 rcs.logFilename for --log-filename
65 rcs.commitFuzz for --rcs-commit-fuzz
66 rcs.symbolCheck for --symbol-check
67 rcs.tagFuzz for --rcs-tag-fuzz
73 STDERR.puts "Could not find #{arg}"
76 # returns a hash that maps usernames to author names & emails
77 def load_authors_file(fn)
80 File.open(File.expand_path(fn)) do |io|
81 io.each_line do |line|
82 uname, author = line.split('=', 2)
85 STDERR.puts "Username #{uname} redefined to #{author}" if hash.has_key? uname
97 fields = string.split('.')
98 raise ArgumentError, "wrong number of fields for RCS date #{string}" unless fields.length == 6
104 # strip an optional final ;
109 # strip the first and last @, and de-double @@s
110 def RCS.sanitize(arg)
114 raise 'malformed first line' unless ret.first[0,1] == '@'
115 raise 'malformed last line' unless ret.last[-1,1] == '@'
116 ret.first.sub!(/^@/,'')
117 ret.last.sub!(/@$/,'')
118 ret.map { |l| l.gsub('@@','@') }
120 arg.chomp('@').sub(/^@/,'').gsub('@@','@')
127 def RCS.at_clean(arg)
128 RCS.sanitize RCS.clean(arg)
136 @@marks[key] = @@marks.length + 1
140 def RCS.blob(file, rev)
141 RCS.mark([file, rev])
144 def RCS.commit(commit)
149 attr_accessor :head, :comment, :desc, :revision, :fname, :mode
150 def initialize(fname, executable)
155 @revision = Hash.new { |h, r| h[r] = Revision.new(self, r) }
156 @mode = executable ? '755' : '644'
159 def has_revision?(rev)
160 @revision.has_key?(rev) and not @revision[rev].author.nil?
163 def export_commits(opts={})
166 until @revision.empty?
169 # a string sort is a very good candidate for
170 # export order, getting a miss only for
171 # multi-digit revision components
172 keys = @revision.keys.sort
174 STDERR.puts "commit export loop ##{counter}"
175 STDERR.puts "\t#{exported.length} commits exported so far: #{exported.join(', ')}" unless exported.empty?
176 STDERR.puts "\t#{keys.size} to export: #{keys.join(', ')}"
180 # the parent commit is rev.next if we're on the
181 # master branch (rev.branch is nil) or
182 # rev.diff_base otherwise
183 from = rev.branch.nil? ? rev.next : rev.diff_base
184 # A commit can only be exported if it has no
185 # parent, or if the parent has been exported
186 # already. Skip this commit otherwise
187 if from and not exported.include? from
191 branch = rev.branch || 'master'
192 author = opts[:authors][rev.author] || "#{rev.author} <empty>"
193 date = "#{rev.date.tv_sec} +0000"
195 if opts[:log_filename]
196 log << @fname << ": "
200 puts "commit refs/heads/#{branch}"
201 puts "mark :#{RCS.commit key}"
202 puts "committer #{author} #{date}"
203 puts "data #{log.length}"
204 puts log unless log.empty?
205 puts "from :#{RCS.commit from}" if rev.branch_point
206 puts "M #{@mode} :#{RCS.blob @fname, key} #{@fname}"
208 # TODO FIXME this *should* be safe, in
209 # that it should not unduly move
210 # branches back in time, but I'm not
212 rev.branches.each do |sym|
213 puts "reset refs/heads/#{sym}"
214 puts "from :#{RCS.commit key}"
216 rev.symbols.each do |sym|
217 puts "reset refs/tags/#{sym}"
218 puts "from :#{RCS.commit key}"
220 if opts[:tag_each_rev]
221 puts "reset refs/tags/#{key}"
222 puts "from :#{RCS.commit key}"
227 exported.each { |k| @revision.delete(k) }
233 attr_accessor :rev, :author, :state, :next
234 attr_accessor :branches, :log, :text, :symbols
235 attr_accessor :branch, :diff_base, :branch_point
237 def initialize(file, rev)
254 @date = Time.rcs(str)
259 ret = "blob\nmark :#{RCS.blob @file.fname, @rev}\ndata #{str.length}\n#{str}\n"
264 def RCS.parse(fname, rcsfile)
265 rcs = RCS::File.new(fname, ::File.executable?(rcsfile))
267 ::File.open(rcsfile, 'r:ASCII-8BIT') do |file|
272 file.each_line do |line|
275 command, args = line.split($;,2)
276 next if command.empty?
278 if command.chomp!(';')
279 STDERR.puts "Skipping empty command #{command.inspect}" if $DEBUG
285 rcs.head = RCS.clean(args.chomp)
291 rcs.comment = RCS.at_clean(args.chomp)
294 if rcs.has_revision?(rev)
295 status.push :revision_data
297 status.push :new_revision
302 status.push :read_lines
303 when 'branch', 'access', 'locks', 'expand'
304 STDERR.puts "Skipping unhandled command #{command.inspect}" if $DEBUG
305 status.push :skipping_lines
309 raise "Unknown command #{command.inspect}"
312 status.pop if line.strip.chomp!(';')
314 # we can have multiple symbols per line
315 pairs = line.strip.split($;)
317 sym, rev = pair.strip.split(':',2);
319 status.pop if rev.chomp!(';')
320 rcs.revision[rev].symbols << sym
326 rcs.desc.replace lines.dup
329 # we sanitize lines as we read them
331 actual_line = line.dup
333 # the first line must begin with a @, which we strip
335 ats = line.match(/^@+/)
336 raise 'malformed line' unless ats
337 actual_line.replace line.sub(/^@/,'')
340 # if the line ends with an ODD number of @, it's the
341 # last line -- we work on actual_line so that content
342 # such as @\n or @ works correctly (they would be
343 # encoded respectively as ['@@@\n','@\n'] and
345 ats = actual_line.chomp.match(/@+$/)
346 if nomore = (ats && Regexp.last_match(0).length.odd?)
347 actual_line.replace actual_line.chomp.sub(/@$/,'')
349 lines << actual_line.gsub('@@','@')
356 when /^date\s+(\S+);\s+author\s+(\S+);\s+state\s+(\S+);$/
357 rcs.revision[rev].date = $1
358 rcs.revision[rev].author = $2
359 rcs.revision[rev].state = $3
361 status.push :branches
364 when /^next\s+(\S+)?;$/
365 nxt = rcs.revision[rev].next = $1
367 raise "multiple diff_bases for #{nxt}" unless rcs.revision[nxt].diff_base.nil?
368 rcs.revision[nxt].diff_base = rev
369 rcs.revision[nxt].branch = rcs.revision[rev].branch
374 candidate = line.split(';',2)
375 branch = candidate.first.strip
376 rcs.revision[rev].branches.push branch
377 raise "multiple diff_bases for #{branch}" unless rcs.revision[branch].diff_base.nil?
378 rcs.revision[branch].diff_base = rev
379 # we drop the last number from the branch name
380 rcs.revision[branch].branch = branch.sub(/\.\d+$/,'.x')
381 rcs.revision[branch].branch_point = rev
382 status.pop if candidate.length > 1
388 status.push :read_lines
396 status.push :read_lines
401 rcs.revision[rev].log.replace lines.dup
404 rcs.revision[rev].text.replace lines.dup
405 puts rcs.revision[rev].blob
408 difflines.replace lines.dup
409 difflines.pop if difflines.last.empty?
410 base = rcs.revision[rev].diff_base
411 unless rcs.revision[base].text
414 raise 'no diff base!'
418 rcs.revision[base].text.each { |l| buffer << [l.dup] }
424 while l = difflines.shift
426 raise 'negative index during insertion' if index < 0
427 raise 'negative count during insertion' if count < 0
430 # collected all the lines, put them before
435 buffer[index].unshift(*adding)
442 raise 'malformed diff' unless l =~ /^([ad])(\d+) (\d+)$/
448 # for deletion, index 1 is the first index, so the Ruby
449 # index is one less than the diff one
451 # we replace them with empty string so that 'a' commands
452 # referring to the same line work properly
459 # addition will prepend the appropriate lines
460 # to the given index, and in this case Ruby
461 # and diff indices are the same
466 # turn the buffer into an array of lines, deleting the empty ones
467 buffer.delete_if { |l| l.empty? }
470 rcs.revision[rev].text = buffer
471 puts rcs.revision[rev].blob
474 raise "Unknown status #{status.last}"
479 # clean up the symbols/branches: look for revisions that have
480 # one or more symbols but no dates, and make them into
481 # branches, pointing to the highest commit with that key
483 keys = rcs.revision.keys
484 rcs.revision.each do |key, rev|
485 if rev.date.nil? and not rev.symbols.empty?
486 top = keys.select { |k| k.match(/^#{key}\./) }.sort.last
487 tr = rcs.revision[top]
488 raise "unhandled complex branch structure met: #{rev.inspect} refers #{tr.inspect}" if tr.date.nil?
489 tr.branches |= rev.symbols
493 branches.each { |k| rcs.revision.delete k }
499 def initialize(commit)
505 testfiles = @files.dup
506 tree.each { |rcs, rev| self.add(rcs, rev, testfiles) }
507 # the next line is only reached if all the adds were
508 # successful, so the merge is atomic
509 @files.replace testfiles
512 def add(rcs, rev, file_list=@files)
513 if file_list.key? rcs
514 prev = file_list[rcs]
515 if prev.log == rev.log
516 str = "re-adding existing file #{rcs.fname} (old: #{prev.rev}, new: #{rev.rev})"
518 str = "re-adding existing file #{rcs.fname} (old: #{[prev.rev, prev.log.to_s].inspect}, new: #{[rev.rev, rev.log.to_s].inspect})"
520 if prev.text != rev.text
523 @commit.warn_about str
535 @files.map do |rcs, rev|
536 files << "M #{rcs.mode} :#{RCS.blob rcs.fname, rev.rev} #{rcs.fname}"
542 @files.map { |rcs, rev| rcs.fname }
551 attr_accessor :date, :log, :symbols, :author, :branch
553 def initialize(rcs, rev)
554 raise NotImplementedError if rev.branch
555 self.date = rev.date.dup
556 self.log = rev.log.dup
557 self.symbols = rev.symbols.dup
558 self.author = rev.author
559 self.branch = rev.branch
561 self.tree = Tree.new self
562 self.tree.add rcs, rev
566 [self.date, self.branch, self.symbols, self.author, self.log, self.tree.to_a]
570 warn str + " for commit on #{self.date}"
573 # Sort by date and then by number of symbols
575 ds = self.date <=> other.date
579 return self.symbols.length <=> other.symbols.length
584 self.tree.merge! commit.tree
585 if commit.date > self.date
586 warn_about "updating date to #{commit.date}"
587 self.date = commit.date
589 self.symbols.merge commit.symbols
593 xbranch = self.branch || 'master'
594 xauthor = opts[:authors][self.author] || "#{self.author} <empty>"
596 numdate = self.date.tv_sec
597 xdate = "#{numdate} +0000"
600 puts "commit refs/heads/#{xbranch}"
601 puts "mark :#{RCS.commit key}"
602 puts "committer #{xauthor} #{xdate}"
603 puts "data #{xlog.length}"
604 puts xlog unless xlog.empty?
605 # TODO branching support for multi-file export
606 # puts "from :#{RCS.commit from}" if self.branch_point
609 # TODO branching support for multi-file export
610 # rev.branches.each do |sym|
611 # puts "reset refs/heads/#{sym}"
612 # puts "from :#{RCS.commit key}"
615 self.symbols.each do |sym|
616 puts "reset refs/tags/#{sym}"
617 puts "from :#{RCS.commit key}"
626 opts = GetoptLong.new(
627 # Authors file, like git-svn and git-cvsimport, more than one can be
629 ['--authors-file', '-A', GetoptLong::REQUIRED_ARGUMENT],
630 # RCS file suffix, like RCS
631 ['--rcs-suffixes', '-x', GetoptLong::REQUIRED_ARGUMENT],
632 # Date fuzziness for commits to be considered the same (in seconds)
633 ['--rcs-commit-fuzz', GetoptLong::REQUIRED_ARGUMENT],
634 # check symbols when coalescing?
635 ['--symbol-check', GetoptLong::NO_ARGUMENT],
636 ['--no-symbol-check', GetoptLong::NO_ARGUMENT],
638 ['--tag-each-rev', GetoptLong::NO_ARGUMENT],
639 ['--no-tag-each-rev', GetoptLong::NO_ARGUMENT],
640 # prepend filenames to commit logs?
641 ['--log-filename', GetoptLong::NO_ARGUMENT],
642 ['--no-log-filename', GetoptLong::NO_ARGUMENT],
643 ['--help', '-h', '-?', GetoptLong::NO_ARGUMENT]
646 # We read options in order, but they apply to all passed parameters.
647 # TODO maybe they should only apply to the following, unless there's only one
649 opts.ordering = GetoptLong::RETURN_IN_ORDER
653 :authors => Hash.new,
658 # Read config options
659 `git config --get-all rcs.authorsfile`.each_line do |fn|
660 parse_options[:authors].merge! load_authors_file(fn.chomp)
663 parse_options[:tag_each_rev] = (
664 `git config --bool rcs.tageachrev`.chomp == 'true'
667 parse_options[:log_filename] = (
668 `git config --bool rcs.logfilename`.chomp == 'true'
671 fuzz = `git config --int rcs.commitFuzz`.chomp
672 parse_options[:commit_fuzz] = fuzz.to_i unless fuzz.empty?
674 fuzz = `git config --int rcs.tagFuzz`.chomp
675 parse_options[:tag_fuzz] = fuzz.to_i unless fuzz.empty?
677 parse_options[:symbol_check] = (
678 `git config --bool rcs.symbolcheck`.chomp == 'false'
681 opts.each do |opt, arg|
683 when '--authors-file'
684 authors = load_authors_file(arg)
685 redef = parse_options[:authors].keys & authors.keys
686 STDERR.puts "Authors file #{arg} redefines #{redef.join(', ')}" unless redef.empty?
687 parse_options[:authors].merge!(authors)
688 when '--rcs-suffixes'
690 when '--rcs-commit-fuzz'
691 parse_options[:commit_fuzz] = arg.to_i
692 when '--rcs-tag-fuzz'
693 parse_options[:tag_fuzz] = arg.to_i
694 when '--symbol-check'
695 parse_options[:symbol_check] = true
696 when '--no-symbol-check'
697 parse_options[:symbol_check] = false
698 when '--tag-each-rev'
699 parse_options[:tag_each_rev] = true
700 when '--no-tag-each-rev'
701 # this is the default, which is fine since the missing key
702 # (default) returns nil which is false in Ruby
703 parse_options[:tag_each_rev] = false
704 when '--log-filename'
705 parse_options[:log_filename] = true
706 when '--no-log-filename'
707 # this is the default, which is fine since the missing key
708 # (default) returns nil which is false in Ruby
709 parse_options[:log_filename] = false
718 if parse_options[:tag_fuzz] < parse_options[:commit_fuzz]
719 parse_options[:tag_fuzz] = parse_options[:commit_fuzz]
724 user = Etc.getlogin || ENV['USER']
726 # steal username/email data from other init files that may contain the
730 # the user's .hgrc file for a username field
731 ['~/.hgrc', /^\s*username\s*=\s*(["'])?(.*)\1$/, 2],
732 # the user's .(g)vimrc for a changelog_username setting
733 ['~/.vimrc', /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
734 ['~/.gvimrc', /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
736 ].each do |fn, rx, idx|
737 file = File.expand_path fn
738 if File.readable?(file) and File.read(file) =~ rx
739 parse_options[:authors][user] = Regexp.last_match(idx).strip
745 if user and not user.empty? and not parse_options[:authors].has_key?(user)
746 name = ENV['GIT_AUTHOR_NAME'] || ''
747 name.replace(`git config user.name`.chomp) if name.empty?
748 name.replace(Etc.getpwnam(user).gecos) if name.empty?
751 # couldn't find a name, try to steal data from other sources
754 # if we found a name, try to find an email too
755 email = ENV['GIT_AUTHOR_EMAIL'] || ''
756 email.replace(`git config user.email`.chomp) if email.empty?
759 # couldn't find an email, try to steal data too
762 # we got both a name and email, fill the info
763 parse_options[:authors][user] = "#{name} <#{email}>"
778 file_list.each do |arg|
779 case ftype = File.ftype(arg)
785 not_found "RCS file #{arg}"
788 filename = File.basename(arg, SFX)
790 filename = File.basename(arg)
791 path = File.dirname(arg)
792 rcsfile = File.join(path, 'RCS', filename) + SFX
793 unless File.exists? rcsfile
794 rcsfile.replace File.join(path, filename) + SFX
795 unless File.exists? rcsfile
796 not_found "RCS file for #{filename} in #{path}"
800 rcs << RCS.parse(filename, rcsfile)
802 pattern = File.join(arg, '**', '*' + SFX)
803 Dir.glob(pattern).each do |rcsfile|
804 filename = File.basename(rcsfile, SFX)
805 path = File.dirname(rcsfile)
806 path.sub!(/\/?RCS$/, '') # strip final /RCS if present
807 path.sub!(/^#{Regexp.escape arg}\/?/, '') # strip initial dirname
808 filename = File.join(path, filename) unless path.empty?
810 rcs << RCS.parse(filename, rcsfile)
811 rescue Exception => e
812 STDERR.puts "Failed to parse #{filename} @ #{rcsfile}:#{$.}"
817 STDERR.puts "Cannot handle #{arg} of #{ftype} type"
823 rcs.first.export_commits(parse_options)
825 STDERR.puts "Preparing commits"
830 r.revision.each do |k, rev|
831 commits << RCS::Commit.new(r, rev)
835 STDERR.puts "Sorting by date"
840 STDERR.puts "RAW commits (#{commits.length}):"
842 PP.pp c.to_a, $stderr
845 STDERR.puts "#{commits.length} single-file commits"
848 STDERR.puts "Coalescing [1] by date with fuzz #{parse_options[:commit_fuzz]}"
850 commits.reverse_each do |c|
851 commits.reverse_each do |k|
852 break if k.date < c.date - parse_options[:commit_fuzz]
854 next if c.log != k.log or c.author != k.author or c.branch != k.branch
855 next if k.date > c.date
856 unless c.symbols.subset?(k.symbols) or k.symbols.subset?(c.symbols)
857 if parse_options[:symbol_check]
858 STDERR.puts "Not coalescing #{c.log.inspect}\n\tfor (#{c.tree.filenames.join(', ')})\n\tand (#{k.tree.filenames.join(', ')})"
859 STDERR.puts "\tbecause their symbols disagree:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
860 STDERR.puts "\tretry with the --no-symbol-check option if you want to merge these commits anyway"
863 STDERR.puts "Coalescing #{c.log.inspect}\n\tfor (#{c.tree.filenames.join(', ')})\n\tand (#{k.tree.filenames.join(', ')})"
864 STDERR.puts "\twith disagreeing symbols:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
869 rescue RuntimeError => err
870 fuzz = c.date - k.date
871 STDERR.puts "Fuzzy commit coalescing failed: #{err}"
872 STDERR.puts "\tretry with commit fuzz < #{fuzz} if you don't want to see this message"
880 STDERR.puts "[1] commits (#{commits.length}):"
882 PP.pp c.to_a, $stderr
885 STDERR.puts "#{commits.length} coalesced commits"
888 commits.each { |c| c.export(parse_options) }