5 * Refactor commit coalescing
6 * Add --strict-symbol-check to only coalesce commits if their symbol lists are equal
7 * Add support for commitid for coalescing commits
8 * Further coalescing options? (e.g. small logfile differences)
9 * Proper branching support in multi-file export
10 * Optimize memory usage by discarding unneeded text
16 class NoBranchSupport < NotImplementedError ; end
18 # Integer#odd? was introduced in Ruby 1.8.7, backport it to
20 unless 2.respond_to? :odd?
30 #{$0} [options] file [file ...]
32 Fast-export the RCS history of one or more files. If a directory is specified,
33 all RCS-tracked files in the directory and its descendants are exported.
35 When importing single files, their pathname is discarded during import. When
36 importing directories, only the specified directory component is discarded.
38 When importing a single file, RCS commits are converted one by one. Otherwise,
39 some heuristics are used to determine how to coalesce commits touching different
42 Currently, commits are coalesced if they share the exact same log and if their
43 date differs by no more than the user-specified fuzziness. Additionally, the
44 symbols in one of the commits must be a subset of the symbols in the other
45 commit, unless --no-symbol-check is specified or rcs.symbolCheck is set to
46 false in the git configuration.
49 git init && rcs-fast-export.rb . | git fast-import && git reset
52 --help, -h, -? display this help text
53 --authors-file, -A specify a file containing username = Full Name <email> mappings
54 --rcs-commit-fuzz fuzziness in RCS commits to be considered a single one when
55 importing multiple files
56 (in seconds, defaults to 300, i.e. 5 minutes)
57 --[no-]symbol-check [do not] check symbols when coalescing commits
58 --[no-]tag-each-rev [do not] create a lightweight tag for each RCS revision when
59 importing a single file
60 --[no-]log-filename [do not] prepend the filename to the commit log when importing
62 --skip-branches when exporting multiple files with a branched history, export
63 the main branch only instead of aborting due to the lack of
64 support for branched multi-file history export
69 rcs.authorsFile for --authors-file
70 rcs.tagEachRev for --tag-each-rev
71 rcs.logFilename for --log-filename
72 rcs.commitFuzz for --rcs-commit-fuzz
73 rcs.symbolCheck for --symbol-check
74 rcs.tagFuzz for --rcs-tag-fuzz
80 STDERR.puts "Could not find #{arg}"
83 # returns a hash that maps usernames to author names & emails
84 def load_authors_file(fn)
87 File.open(File.expand_path(fn)) do |io|
88 io.each_line do |line|
89 uname, author = line.split('=', 2)
92 STDERR.puts "Username #{uname} redefined to #{author}" if hash.has_key? uname
104 fields = string.split('.')
105 raise ArgumentError, "wrong number of fields for RCS date #{string}" unless fields.length == 6
111 # strip an optional final ;
116 # strip the first and last @, and de-double @@s
117 def RCS.sanitize(arg)
121 raise 'malformed first line' unless ret.first[0,1] == '@'
122 raise 'malformed last line' unless ret.last[-1,1] == '@'
123 ret.first.sub!(/^@/,'')
124 ret.last.sub!(/@$/,'')
125 ret.map { |l| l.gsub('@@','@') }
127 arg.chomp('@').sub(/^@/,'').gsub('@@','@')
134 def RCS.at_clean(arg)
135 RCS.sanitize RCS.clean(arg)
143 @@marks[key] = @@marks.length + 1
147 def RCS.blob(file, rev)
148 RCS.mark([file, rev])
151 def RCS.commit(commit)
156 attr_accessor :head, :comment, :desc, :revision, :fname, :mode
157 def initialize(fname, executable)
162 @revision = Hash.new { |h, r| h[r] = Revision.new(self, r) }
163 @mode = executable ? '755' : '644'
166 def has_revision?(rev)
167 @revision.has_key?(rev) and not @revision[rev].author.nil?
170 def export_commits(opts={})
173 until @revision.empty?
176 # a string sort is a very good candidate for
177 # export order, getting a miss only for
178 # multi-digit revision components
179 keys = @revision.keys.sort
181 STDERR.puts "commit export loop ##{counter}"
182 STDERR.puts "\t#{exported.length} commits exported so far: #{exported.join(', ')}" unless exported.empty?
183 STDERR.puts "\t#{keys.size} to export: #{keys.join(', ')}"
187 # the parent commit is rev.next if we're on the
188 # master branch (rev.branch is nil) or
189 # rev.diff_base otherwise
190 from = rev.branch.nil? ? rev.next : rev.diff_base
191 # A commit can only be exported if it has no
192 # parent, or if the parent has been exported
193 # already. Skip this commit otherwise
194 if from and not exported.include? from
198 branch = rev.branch || 'master'
199 author = opts[:authors][rev.author] || "#{rev.author} <empty>"
200 date = "#{rev.date.tv_sec} +0000"
202 if opts[:log_filename]
203 log << @fname << ": "
207 puts "commit refs/heads/#{branch}"
208 puts "mark :#{RCS.commit key}"
209 puts "committer #{author} #{date}"
210 puts "data #{log.length}"
211 puts log unless log.empty?
212 puts "from :#{RCS.commit from}" if from
213 puts "M #{@mode} :#{RCS.blob @fname, key} #{@fname}"
215 # TODO FIXME this *should* be safe, in
216 # that it should not unduly move
217 # branches back in time, but I'm not
219 rev.branches.each do |sym|
220 puts "reset refs/heads/#{sym}"
221 puts "from :#{RCS.commit key}"
223 rev.symbols.each do |sym|
224 puts "reset refs/tags/#{sym}"
225 puts "from :#{RCS.commit key}"
227 if opts[:tag_each_rev]
228 puts "reset refs/tags/#{key}"
229 puts "from :#{RCS.commit key}"
234 exported.each { |k| @revision.delete(k) }
240 attr_accessor :rev, :author, :state, :next
241 attr_accessor :branches, :log, :text, :symbols
242 attr_accessor :branch, :diff_base, :branch_point
244 def initialize(file, rev)
261 @date = Time.rcs(str)
266 ret = "blob\nmark :#{RCS.blob @file.fname, @rev}\ndata #{str.length}\n#{str}\n"
271 def RCS.parse(fname, rcsfile)
272 rcs = RCS::File.new(fname, ::File.executable?(rcsfile))
274 ::File.open(rcsfile, 'r:ASCII-8BIT') do |file|
279 file.each_line do |line|
282 command, args = line.split($;,2)
283 next if command.empty?
285 if command.chomp!(';')
286 STDERR.puts "Skipping empty command #{command.inspect}" if $DEBUG
292 rcs.head = RCS.clean(args.chomp)
298 rcs.comment = RCS.at_clean(args.chomp)
301 if rcs.has_revision?(rev)
302 status.push :revision_data
304 status.push :new_revision
309 status.push :read_lines
310 when 'branch', 'access', 'locks', 'expand'
311 STDERR.puts "Skipping unhandled command #{command.inspect}" if $DEBUG
312 status.push :skipping_lines
316 raise "Unknown command #{command.inspect}"
319 status.pop if line.strip.chomp!(';')
321 # we can have multiple symbols per line
322 pairs = line.strip.split($;)
324 sym, rev = pair.strip.split(':',2);
326 status.pop if rev.chomp!(';')
327 rcs.revision[rev].symbols << sym
333 rcs.desc.replace lines.dup
336 # we sanitize lines as we read them
338 actual_line = line.dup
340 # the first line must begin with a @, which we strip
342 ats = line.match(/^@+/)
343 raise 'malformed line' unless ats
344 actual_line.replace line.sub(/^@/,'')
347 # if the line ends with an ODD number of @, it's the
348 # last line -- we work on actual_line so that content
349 # such as @\n or @ work correctly (they would be
350 # encoded respectively as ['@@@\n','@\n'] and
352 ats = actual_line.chomp.match(/@+$/)
353 if nomore = (ats && Regexp.last_match(0).length.odd?)
354 actual_line.replace actual_line.chomp.sub(/@$/,'')
356 lines << actual_line.gsub('@@','@')
363 when /^date\s+(\S+);\s+author\s+(\S+);\s+state\s+(\S+);$/
364 rcs.revision[rev].date = $1
365 rcs.revision[rev].author = $2
366 rcs.revision[rev].state = $3
369 when /^branches(?:\s+|$)/
370 status.push :branches
372 line = line.sub(/^branches\s+/,'')
375 when /^next\s+(\S+)?;$/
376 nxt = rcs.revision[rev].next = $1
378 raise "multiple diff_bases for #{nxt}" unless rcs.revision[nxt].diff_base.nil?
379 rcs.revision[nxt].diff_base = rev
380 rcs.revision[nxt].branch = rcs.revision[rev].branch
385 candidate = line.split(';',2)
386 branch = candidate.first.strip
387 raise "multiple diff_bases for #{branch}" unless rcs.revision[branch].diff_base.nil?
388 rcs.revision[branch].diff_base = rev
389 # we drop the last number from the branch name
390 rcs.revision[branch].branch = branch.sub(/\.\d+$/,'.x')
391 rcs.revision[branch].branch_point = rev
392 status.pop if candidate.length > 1
398 status.push :read_lines
406 status.push :read_lines
411 rcs.revision[rev].log.replace lines.dup
414 rcs.revision[rev].text.replace lines.dup
415 puts rcs.revision[rev].blob
418 difflines.replace lines.dup
419 difflines.pop if difflines.last.empty?
420 base = rcs.revision[rev].diff_base
421 unless rcs.revision[base].text
424 raise 'no diff base!'
428 rcs.revision[base].text.each { |l| buffer << [l.dup] }
434 while l = difflines.shift
436 raise 'negative index during insertion' if index < 0
437 raise 'negative count during insertion' if count < 0
440 # collected all the lines, put them before
445 buffer[index].unshift(*adding)
452 raise 'malformed diff' unless l =~ /^([ad])(\d+) (\d+)$/
458 # for deletion, index 1 is the first index, so the Ruby
459 # index is one less than the diff one
461 # we replace them with empty string so that 'a' commands
462 # referring to the same line work properly
469 # addition will prepend the appropriate lines
470 # to the given index, and in this case Ruby
471 # and diff indices are the same
476 # turn the buffer into an array of lines, deleting the empty ones
477 buffer.delete_if { |l| l.empty? }
480 rcs.revision[rev].text = buffer
481 puts rcs.revision[rev].blob
484 raise "Unknown status #{status.last}"
489 # clean up the symbols/branches: look for revisions that have
490 # one or more symbols but no dates, and make them into
491 # branches, pointing to the highest commit with that key
493 keys = rcs.revision.keys
494 rcs.revision.each do |key, rev|
495 if rev.date.nil? and not rev.symbols.empty?
496 top = keys.select { |k| k.match(/^#{key}\./) }.sort.last
497 tr = rcs.revision[top]
498 raise "unhandled complex branch structure met: #{rev.inspect} refers #{tr.inspect}" if tr.date.nil?
499 tr.branches |= rev.symbols
503 branches.each { |k| rcs.revision.delete k }
509 def initialize(commit)
515 testfiles = @files.dup
516 tree.each { |rcs, rev| self.add(rcs, rev, testfiles) }
517 # the next line is only reached if all the adds were
518 # successful, so the merge is atomic
519 @files.replace testfiles
522 def add(rcs, rev, file_list=@files)
523 if file_list.key? rcs
524 prev = file_list[rcs]
525 if prev.log == rev.log
526 str = "re-adding existing file #{rcs.fname} (old: #{prev.rev}, new: #{rev.rev})"
528 str = "re-adding existing file #{rcs.fname} (old: #{[prev.rev, prev.log.to_s].inspect}, new: #{[rev.rev, rev.log.to_s].inspect})"
530 if prev.text != rev.text
533 @commit.warn_about str
545 @files.map do |rcs, rev|
546 if rev.state.downcase == "dead"
547 files << "D #{rcs.fname}"
549 files << "M #{rcs.mode} :#{RCS.blob rcs.fname, rev.rev} #{rcs.fname}"
556 @files.map { |rcs, rev| rcs.fname }
565 attr_accessor :date, :log, :symbols, :author, :branch
567 attr_accessor :min_date, :max_date
568 def initialize(rcs, rev)
569 raise NoBranchSupport if rev.branch
570 self.date = rev.date.dup
571 self.min_date = self.max_date = self.date
572 self.log = rev.log.dup
573 self.symbols = rev.symbols.dup
574 self.author = rev.author
575 self.branch = rev.branch
577 self.tree = Tree.new self
578 self.tree.add rcs, rev
582 [self.min_date, self.date, self.max_date, self.branch, self.symbols, self.author, self.log, self.tree.to_a]
586 warn str + " for commit on #{self.date}"
589 # Sort by date and then by number of symbols
591 ds = self.date <=> other.date
595 return self.symbols.length <=> other.symbols.length
600 self.tree.merge! commit.tree
601 if commit.max_date > self.max_date
602 self.max_date = commit.max_date
604 if commit.min_date < self.min_date
605 self.min_date = commit.min_date
607 self.symbols.merge commit.symbols
611 xbranch = self.branch || 'master'
612 xauthor = opts[:authors][self.author] || "#{self.author} <empty>"
614 numdate = self.date.tv_sec
615 xdate = "#{numdate} +0000"
618 puts "commit refs/heads/#{xbranch}"
619 puts "mark :#{RCS.commit key}"
620 puts "committer #{xauthor} #{xdate}"
621 puts "data #{xlog.length}"
622 puts xlog unless xlog.empty?
623 # TODO branching support for multi-file export
624 # puts "from :#{RCS.commit from}" if self.branch_point
627 # TODO branching support for multi-file export
628 # rev.branches.each do |sym|
629 # puts "reset refs/heads/#{sym}"
630 # puts "from :#{RCS.commit key}"
633 self.symbols.each do |sym|
634 puts "reset refs/tags/#{sym}"
635 puts "from :#{RCS.commit key}"
644 opts = GetoptLong.new(
645 # Authors file, like git-svn and git-cvsimport, more than one can be
647 ['--authors-file', '-A', GetoptLong::REQUIRED_ARGUMENT],
648 # RCS file suffix, like RCS
649 ['--rcs-suffixes', '-x', GetoptLong::REQUIRED_ARGUMENT],
650 # Date fuzziness for commits to be considered the same (in seconds)
651 ['--rcs-commit-fuzz', GetoptLong::REQUIRED_ARGUMENT],
652 # check symbols when coalescing?
653 ['--symbol-check', GetoptLong::NO_ARGUMENT],
654 ['--no-symbol-check', GetoptLong::NO_ARGUMENT],
656 ['--tag-each-rev', GetoptLong::NO_ARGUMENT],
657 ['--no-tag-each-rev', GetoptLong::NO_ARGUMENT],
658 # prepend filenames to commit logs?
659 ['--log-filename', GetoptLong::NO_ARGUMENT],
660 ['--no-log-filename', GetoptLong::NO_ARGUMENT],
661 # skip branches when exporting a whole tree?
662 ['--skip-branches', GetoptLong::NO_ARGUMENT],
663 ['--help', '-h', '-?', GetoptLong::NO_ARGUMENT]
666 # We read options in order, but they apply to all passed parameters.
667 # TODO maybe they should only apply to the following, unless there's only one
669 opts.ordering = GetoptLong::RETURN_IN_ORDER
673 :authors => Hash.new,
678 # Read config options
679 `git config --get-all rcs.authorsfile`.each_line do |fn|
680 parse_options[:authors].merge! load_authors_file(fn.chomp)
683 parse_options[:tag_each_rev] = (
684 `git config --bool rcs.tageachrev`.chomp == 'true'
687 parse_options[:log_filename] = (
688 `git config --bool rcs.logfilename`.chomp == 'true'
691 fuzz = `git config --int rcs.commitFuzz`.chomp
692 parse_options[:commit_fuzz] = fuzz.to_i unless fuzz.empty?
694 fuzz = `git config --int rcs.tagFuzz`.chomp
695 parse_options[:tag_fuzz] = fuzz.to_i unless fuzz.empty?
697 parse_options[:symbol_check] = (
698 `git config --bool rcs.symbolcheck`.chomp == 'false'
701 opts.each do |opt, arg|
703 when '--authors-file'
704 authors = load_authors_file(arg)
705 redef = parse_options[:authors].keys & authors.keys
706 STDERR.puts "Authors file #{arg} redefines #{redef.join(', ')}" unless redef.empty?
707 parse_options[:authors].merge!(authors)
708 when '--rcs-suffixes'
710 when '--rcs-commit-fuzz'
711 parse_options[:commit_fuzz] = arg.to_i
712 when '--rcs-tag-fuzz'
713 parse_options[:tag_fuzz] = arg.to_i
714 when '--symbol-check'
715 parse_options[:symbol_check] = true
716 when '--no-symbol-check'
717 parse_options[:symbol_check] = false
718 when '--tag-each-rev'
719 parse_options[:tag_each_rev] = true
720 when '--no-tag-each-rev'
721 # this is the default, which is fine since the missing key
722 # (default) returns nil which is false in Ruby
723 parse_options[:tag_each_rev] = false
724 when '--log-filename'
725 parse_options[:log_filename] = true
726 when '--no-log-filename'
727 # this is the default, which is fine since the missing key
728 # (default) returns nil which is false in Ruby
729 parse_options[:log_filename] = false
730 when '--skip-branches'
731 parse_options[:skip_branches] = true
740 if parse_options[:tag_fuzz] < parse_options[:commit_fuzz]
741 parse_options[:tag_fuzz] = parse_options[:commit_fuzz]
746 user = Etc.getlogin || ENV['USER']
748 # steal username/email data from other init files that may contain the
752 # the user's .hgrc file for a username field
753 ['~/.hgrc', /^\s*username\s*=\s*(["'])?(.*)\1$/, 2],
754 # the user's .(g)vimrc for a changelog_username setting
755 ['~/.vimrc', /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
756 ['~/.gvimrc', /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
757 ].each do |fn, rx, idx|
758 file = File.expand_path fn
759 if File.readable?(file) and File.read(file) =~ rx
760 parse_options[:authors][user] = Regexp.last_match(idx).strip
766 if user and not user.empty? and not parse_options[:authors].has_key?(user)
767 name = ENV['GIT_AUTHOR_NAME'] || ''
768 name.replace(`git config user.name`.chomp) if name.empty?
769 name.replace(Etc.getpwnam(user).gecos) if name.empty?
772 # couldn't find a name, try to steal data from other sources
775 # if we found a name, try to find an email too
776 email = ENV['GIT_AUTHOR_EMAIL'] || ''
777 email.replace(`git config user.email`.chomp) if email.empty?
780 # couldn't find an email, try to steal data too
783 # we got both a name and email, fill the info
784 parse_options[:authors][user] = "#{name} <#{email}>"
799 file_list.each do |arg|
800 case ftype = File.ftype(arg)
806 not_found "RCS file #{arg}"
809 filename = File.basename(arg, SFX)
811 filename = File.basename(arg)
812 path = File.dirname(arg)
813 rcsfile = File.join(path, 'RCS', filename) + SFX
814 unless File.exists? rcsfile
815 rcsfile.replace File.join(path, filename) + SFX
816 unless File.exists? rcsfile
817 not_found "RCS file for #{filename} in #{path}"
821 rcs << RCS.parse(filename, rcsfile)
823 argdirname = arg.chomp(File::SEPARATOR)
824 pattern = File.join(argdirname, '**', '*' + SFX)
825 Dir.glob(pattern).each do |rcsfile|
826 filename = File.basename(rcsfile, SFX)
827 path = File.dirname(rcsfile)
828 # strip trailing "/RCS" if present, or "RCS" if that's
830 path.sub!(/(^|#{File::SEPARATOR})RCS$/, '')
831 # strip off the portion of the path specified
832 # on the command line from the front of the path
833 # (or delete the path completely if it is the same
834 # as the specified directory)
835 path.sub!(/^#{Regexp.escape argdirname}(#{File::SEPARATOR}|$)/, '')
836 filename = File.join(path, filename) unless path.empty?
838 rcs << RCS.parse(filename, rcsfile)
839 rescue Exception => e
840 STDERR.puts "Failed to parse #{filename} @ #{rcsfile}:#{$.}"
845 STDERR.puts "Cannot handle #{arg} of #{ftype} type"
851 rcs.first.export_commits(parse_options)
853 STDERR.puts "Preparing commits"
858 r.revision.each do |k, rev|
860 commits << RCS::Commit.new(r, rev)
861 rescue NoBranchSupport
862 if parse_options[:skip_branches]
863 STDERR.puts "Skipping revision #{rev.rev} for #{r.fname} (branch)"
870 STDERR.puts "Sorting by date"
875 STDERR.puts "RAW commits (#{commits.length}):"
877 PP.pp c.to_a, $stderr
880 STDERR.puts "#{commits.length} single-file commits"
883 STDERR.puts "Coalescing [1] by date with fuzz #{parse_options[:commit_fuzz]}"
885 thisindex = commits.size
886 commits.reverse_each do |c|
887 nextindex = thisindex
890 cfiles = Set.new c.tree.filenames
895 # test for mergeable commits by looking at following commits
896 while nextindex < commits.size
897 k = commits[nextindex]
900 # commits are date-sorted, so we know we can quit early if we are too far
901 # for coalescing to work
902 break if k.min_date > c.max_date + parse_options[:commit_fuzz]
906 kfiles = Set.new k.tree.filenames
908 if c.log != k.log or c.author != k.author or c.branch != k.branch
912 unless c.symbols.subset?(k.symbols) or k.symbols.subset?(c.symbols)
913 cflist = cfiles.to_a.join(', ')
914 kflist = kfiles.to_a.join(', ')
915 if parse_options[:symbol_check]
916 STDERR.puts "Not coalescing #{c.log.inspect}\n\tfor (#{cflist})\n\tand (#{kflist})"
917 STDERR.puts "\tbecause their symbols disagree:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
918 STDERR.puts "\tretry with the --no-symbol-check option if you want to merge these commits anyway"
921 STDERR.puts "Coalescing #{c.log.inspect}\n\tfor (#{cflist})\n\tand (#{kflist})"
922 STDERR.puts "\twith disagreeing symbols:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
926 # keep track of filenames touched by commits we are not merging with,
927 # since we don't want to merge with commits that touch them, to preserve
928 # the monotonicity of history for each file
929 # TODO we could forward-merge with them, unless some of our files were
932 # if the candidate touches any file already in the commit,
933 # we can stop looking forward
934 break unless cfiles.intersection(kfiles).empty?
939 # the candidate has the same log, author, branch and appropriate symbols
940 # does it touch anything in ofiles?
941 unless ofiles.intersection(kfiles).empty?
943 cflist = cfiles.to_a.join(', ')
944 kflist = kfiles.to_a.join(', ')
945 oflist = ofiles.to_a.join(', ')
946 STDERR.puts "Not coalescing #{c.log.inspect}\n\tfor (#{cflist})\n\tand (#{kflist})"
947 STDERR.puts "\tbecause the latter intersects #{oflist} in #{(ofiles & kfiles).to_a.inspect}"
955 mergeable.each do |k|
958 rescue RuntimeError => err
959 fuzz = c.date - k.date
960 STDERR.puts "Fuzzy commit coalescing failed: #{err}"
961 STDERR.puts "\tretry with commit fuzz < #{fuzz} if you don't want to see this message"
969 STDERR.puts "[1] commits (#{commits.length}):"
971 PP.pp c.to_a, $stderr
974 STDERR.puts "#{commits.length} coalesced commits"
977 commits.each { |c| c.export(parse_options) }