5 * Refactor commit coalescing
6 * Add --strict-symbol-check to only coalesce commits if their symbol lists are equal
7 * Add support for commitid for coalescing commits
8 * Further coalescing options? (e.g. small logfile differences)
9 * Proper branching support in multi-file export
10 * Optimize memory usage by discarding unneeded text
# Raised when a multi-file commit is requested for a revision that lives on
# a branch: branched history is not supported in multi-file export.
16 class NoBranchSupport < NotImplementedError ; end
18 # Integer#odd? was introduced in Ruby 1.8.7, backport it to
20 unless 2.respond_to? :odd?
30 #{$0} [options] file [file ...]
32 Fast-export the RCS history of one or more files. If a directory is specified,
33 all RCS-tracked files in the directory and its descendants are exported.
35 When importing single files, their pathname is discarded during import. When
36 importing directories, only the specified directory component is discarded.
38 When importing a single file, RCS commits are converted one by one. Otherwise,
39 some heuristics are used to determine how to coalesce commits touching different
42 Currently, commits are coalesced if they share the exact same log and if their
43 date differs by no more than the user-specified fuzziness. Additionally, the
44 symbols in one of the commits must be a subset of the symbols in the other
45 commit, unless --no-symbol-check is specified or rcs.symbolCheck is set to
46 false in the git configuration.
49 git init && rcs-fast-export.rb . | git fast-import && git reset
52 --help, -h, -? display this help text
53 --authors-file, -A specify a file containing username = Full Name <email> mappings
54 --rcs-commit-fuzz fuzziness in RCS commits to be considered a single one when
55 importing multiple files
56 (in seconds, defaults to 300, i.e. 5 minutes)
57 --[no-]symbol-check [do not] check symbols when coalescing commits
58 --[no-]tag-each-rev [do not] create a lightweight tag for each RCS revision when
59 importing a single file
60 --[no-]log-filename [do not] prepend the filename to the commit log when importing
62 --skip-branches when exporting multiple files with a branched history, export
63 the main branch only instead of aborting due to the lack of
64 support for branched multi-file history export
69 rcs.authorsFile for --authors-file
70 rcs.tagEachRev for --tag-each-rev
71 rcs.logFilename for --log-filename
72 rcs.commitFuzz for --rcs-commit-fuzz
73 rcs.symbolCheck for --symbol-check
74 rcs.tagFuzz for --rcs-tag-fuzz
80 STDERR.puts "Could not find #{arg}"
83 # returns a hash that maps usernames to author names & emails
84 def load_authors_file(fn)
87 File.open(File.expand_path(fn)) do |io|
88 io.each_line do |line|
89 uname, author = line.split('=', 2)
92 STDERR.puts "Username #{uname} redefined to #{author}" if hash.has_key? uname
104 fields = string.split('.')
105 raise ArgumentError, "wrong number of fields for RCS date #{string}" unless fields.length == 6
111 # strip an optional final ;
116 # strip the first and last @, and de-double @@s
117 def RCS.sanitize(arg)
121 raise 'malformed first line' unless ret.first[0,1] == '@'
122 raise 'malformed last line' unless ret.last[-1,1] == '@'
123 ret.first.sub!(/^@/,'')
124 ret.last.sub!(/@$/,'')
125 ret.map { |l| l.gsub('@@','@') }
127 arg.chomp('@').sub(/^@/,'').gsub('@@','@')
134 def RCS.at_clean(arg)
135 RCS.sanitize RCS.clean(arg)
143 @@marks[key] = @@marks.length + 1
147 def RCS.blob(file, rev)
148 RCS.mark([file, rev])
151 def RCS.commit(commit)
156 attr_accessor :head, :comment, :desc, :revision, :fname, :mode
157 def initialize(fname, executable)
162 @revision = Hash.new { |h, r| h[r] = Revision.new(self, r) }
163 @mode = executable ? '755' : '644'
166 def has_revision?(rev)
167 @revision.has_key?(rev) and not @revision[rev].author.nil?
170 def export_commits(opts={})
173 until @revision.empty?
176 # a string sort is a very good candidate for
177 # export order, getting a miss only for
178 # multi-digit revision components
179 keys = @revision.keys.sort
181 STDERR.puts "commit export loop ##{counter}"
182 STDERR.puts "\t#{exported.length} commits exported so far: #{exported.join(', ')}" unless exported.empty?
183 STDERR.puts "\t#{keys.size} to export: #{keys.join(', ')}"
187 # the parent commit is rev.next if we're on the
188 # master branch (rev.branch is nil) or
189 # rev.diff_base otherwise
190 from = rev.branch.nil? ? rev.next : rev.diff_base
191 # A commit can only be exported if it has no
192 # parent, or if the parent has been exported
193 # already. Skip this commit otherwise
194 if from and not exported.include? from
198 branch = rev.branch || 'master'
199 author = opts[:authors][rev.author] || "#{rev.author} <empty>"
200 date = "#{rev.date.tv_sec} +0000"
202 if opts[:log_filename]
203 log << @fname << ": "
207 puts "commit refs/heads/#{branch}"
208 puts "mark :#{RCS.commit key}"
209 puts "committer #{author} #{date}"
210 puts "data #{log.length}"
211 puts log unless log.empty?
212 puts "from :#{RCS.commit from}" if rev.branch_point
213 puts "M #{@mode} :#{RCS.blob @fname, key} #{@fname}"
215 # TODO FIXME this *should* be safe, in
216 # that it should not unduly move
217 # branches back in time, but I'm not
219 rev.branches.each do |sym|
220 puts "reset refs/heads/#{sym}"
221 puts "from :#{RCS.commit key}"
223 rev.symbols.each do |sym|
224 puts "reset refs/tags/#{sym}"
225 puts "from :#{RCS.commit key}"
227 if opts[:tag_each_rev]
228 puts "reset refs/tags/#{key}"
229 puts "from :#{RCS.commit key}"
234 exported.each { |k| @revision.delete(k) }
240 attr_accessor :rev, :author, :state, :next
241 attr_accessor :branches, :log, :text, :symbols
242 attr_accessor :branch, :diff_base, :branch_point
244 def initialize(file, rev)
261 @date = Time.rcs(str)
266 ret = "blob\nmark :#{RCS.blob @file.fname, @rev}\ndata #{str.length}\n#{str}\n"
271 def RCS.parse(fname, rcsfile)
272 rcs = RCS::File.new(fname, ::File.executable?(rcsfile))
274 ::File.open(rcsfile, 'r:ASCII-8BIT') do |file|
279 file.each_line do |line|
282 command, args = line.split($;,2)
283 next if command.empty?
285 if command.chomp!(';')
286 STDERR.puts "Skipping empty command #{command.inspect}" if $DEBUG
292 rcs.head = RCS.clean(args.chomp)
298 rcs.comment = RCS.at_clean(args.chomp)
301 if rcs.has_revision?(rev)
302 status.push :revision_data
304 status.push :new_revision
309 status.push :read_lines
310 when 'branch', 'access', 'locks', 'expand'
311 STDERR.puts "Skipping unhandled command #{command.inspect}" if $DEBUG
312 status.push :skipping_lines
316 raise "Unknown command #{command.inspect}"
319 status.pop if line.strip.chomp!(';')
321 # we can have multiple symbols per line
322 pairs = line.strip.split($;)
324 sym, rev = pair.strip.split(':',2);
326 status.pop if rev.chomp!(';')
327 rcs.revision[rev].symbols << sym
333 rcs.desc.replace lines.dup
336 # we sanitize lines as we read them
338 actual_line = line.dup
340 # the first line must begin with a @, which we strip
342 ats = line.match(/^@+/)
343 raise 'malformed line' unless ats
344 actual_line.replace line.sub(/^@/,'')
347 # if the line ends with an ODD number of @, it's the
348 # last line -- we work on actual_line so that content
349 # such as @\n or @ work correctly (they would be
350 # encoded respectively as ['@@@\n','@\n'] and
352 ats = actual_line.chomp.match(/@+$/)
353 if nomore = (ats && Regexp.last_match(0).length.odd?)
354 actual_line.replace actual_line.chomp.sub(/@$/,'')
356 lines << actual_line.gsub('@@','@')
363 when /^date\s+(\S+);\s+author\s+(\S+);\s+state\s+(\S+);$/
364 rcs.revision[rev].date = $1
365 rcs.revision[rev].author = $2
366 rcs.revision[rev].state = $3
370 status.push :branches
372 line = line.sub(/^branches\s+/,'')
375 when /^next\s+(\S+)?;$/
376 nxt = rcs.revision[rev].next = $1
378 raise "multiple diff_bases for #{nxt}" unless rcs.revision[nxt].diff_base.nil?
379 rcs.revision[nxt].diff_base = rev
380 rcs.revision[nxt].branch = rcs.revision[rev].branch
385 candidate = line.split(';',2)
386 branch = candidate.first.strip
387 rcs.revision[rev].branches << branch
388 raise "multiple diff_bases for #{branch}" unless rcs.revision[branch].diff_base.nil?
389 rcs.revision[branch].diff_base = rev
390 # we drop the last number from the branch name
391 rcs.revision[branch].branch = branch.sub(/\.\d+$/,'.x')
392 rcs.revision[branch].branch_point = rev
393 status.pop if candidate.length > 1
399 status.push :read_lines
407 status.push :read_lines
412 rcs.revision[rev].log.replace lines.dup
415 rcs.revision[rev].text.replace lines.dup
416 puts rcs.revision[rev].blob
419 difflines.replace lines.dup
420 difflines.pop if difflines.last.empty?
421 base = rcs.revision[rev].diff_base
422 unless rcs.revision[base].text
425 raise 'no diff base!'
429 rcs.revision[base].text.each { |l| buffer << [l.dup] }
435 while l = difflines.shift
437 raise 'negative index during insertion' if index < 0
438 raise 'negative count during insertion' if count < 0
441 # collected all the lines, put the before
446 buffer[index].unshift(*adding)
453 raise 'malformed diff' unless l =~ /^([ad])(\d+) (\d+)$/
459 # for deletion, index 1 is the first index, so the Ruby
460 # index is one less than the diff one
462 # we replace them with empty string so that 'a' commands
463 # referring to the same line work properly
470 # addition will prepend the appropriate lines
471 # to the given index, and in this case Ruby
472 # and diff indices are the same
477 # turn the buffer into an array of lines, deleting the empty ones
478 buffer.delete_if { |l| l.empty? }
481 rcs.revision[rev].text = buffer
482 puts rcs.revision[rev].blob
485 raise "Unknown status #{status.last}"
490 # clean up the symbols/branches: look for revisions that have
491 # one or more symbols but no dates, and make them into
492 # branches, pointing to the highest commit with that key
494 keys = rcs.revision.keys
495 rcs.revision.each do |key, rev|
496 if rev.date.nil? and not rev.symbols.empty?
497 top = keys.select { |k| k.match(/^#{key}\./) }.sort.last
498 tr = rcs.revision[top]
499 raise "unhandled complex branch structure met: #{rev.inspect} refers #{tr.inspect}" if tr.date.nil?
500 tr.branches |= rev.symbols
504 branches.each { |k| rcs.revision.delete k }
510 def initialize(commit)
516 testfiles = @files.dup
517 tree.each { |rcs, rev| self.add(rcs, rev, testfiles) }
518 # the next line is only reached if all the adds were
519 # successful, so the merge is atomic
520 @files.replace testfiles
523 def add(rcs, rev, file_list=@files)
524 if file_list.key? rcs
525 prev = file_list[rcs]
526 if prev.log == rev.log
527 str = "re-adding existing file #{rcs.fname} (old: #{prev.rev}, new: #{rev.rev})"
529 str = "re-adding existing file #{rcs.fname} (old: #{[prev.rev, prev.log.to_s].inspect}, new: #{[rev.rev, rev.log.to_s].inspect})"
531 if prev.text != rev.text
534 @commit.warn_about str
546 @files.map do |rcs, rev|
547 if rev.state.downcase == "dead"
548 files << "D #{rcs.fname}"
550 files << "M #{rcs.mode} :#{RCS.blob rcs.fname, rev.rev} #{rcs.fname}"
557 @files.map { |rcs, rev| rcs.fname }
566 attr_accessor :date, :log, :symbols, :author, :branch
568 attr_accessor :min_date, :max_date
569 def initialize(rcs, rev)
570 raise NoBranchSupport if rev.branch
571 self.date = rev.date.dup
572 self.min_date = self.max_date = self.date
573 self.log = rev.log.dup
574 self.symbols = rev.symbols.dup
575 self.author = rev.author
576 self.branch = rev.branch
578 self.tree = Tree.new self
579 self.tree.add rcs, rev
583 [self.min_date, self.date, self.max_date, self.branch, self.symbols, self.author, self.log, self.tree.to_a]
587 warn str + " for commit on #{self.date}"
590 # Sort by date and then by number of symbols
592 ds = self.date <=> other.date
596 return self.symbols.length <=> other.symbols.length
601 self.tree.merge! commit.tree
602 if commit.max_date > self.max_date
603 self.max_date = commit.max_date
605 if commit.min_date < self.min_date
606 self.min_date = commit.min_date
608 self.symbols.merge commit.symbols
612 xbranch = self.branch || 'master'
613 xauthor = opts[:authors][self.author] || "#{self.author} <empty>"
615 numdate = self.date.tv_sec
616 xdate = "#{numdate} +0000"
619 puts "commit refs/heads/#{xbranch}"
620 puts "mark :#{RCS.commit key}"
621 puts "committer #{xauthor} #{xdate}"
622 puts "data #{xlog.length}"
623 puts xlog unless xlog.empty?
624 # TODO branching support for multi-file export
625 # puts "from :#{RCS.commit from}" if self.branch_point
628 # TODO branching support for multi-file export
629 # rev.branches.each do |sym|
630 # puts "reset refs/heads/#{sym}"
631 # puts "from :#{RCS.commit key}"
634 self.symbols.each do |sym|
635 puts "reset refs/tags/#{sym}"
636 puts "from :#{RCS.commit key}"
645 opts = GetoptLong.new(
646 # Authors file, like git-svn and git-cvsimport, more than one can be
648 ['--authors-file', '-A', GetoptLong::REQUIRED_ARGUMENT],
649 # RCS file suffix, like RCS
650 ['--rcs-suffixes', '-x', GetoptLong::REQUIRED_ARGUMENT],
651 # Date fuzziness for commits to be considered the same (in seconds)
652 ['--rcs-commit-fuzz', GetoptLong::REQUIRED_ARGUMENT],
653 # check symbols when coalescing?
654 ['--symbol-check', GetoptLong::NO_ARGUMENT],
655 ['--no-symbol-check', GetoptLong::NO_ARGUMENT],
657 ['--tag-each-rev', GetoptLong::NO_ARGUMENT],
658 ['--no-tag-each-rev', GetoptLong::NO_ARGUMENT],
659 # prepend filenames to commit logs?
660 ['--log-filename', GetoptLong::NO_ARGUMENT],
661 ['--no-log-filename', GetoptLong::NO_ARGUMENT],
662 # skip branches when exporting a whole tree?
663 ['--skip-branches', GetoptLong::NO_ARGUMENT],
664 ['--help', '-h', '-?', GetoptLong::NO_ARGUMENT]
667 # We read options in order, but they apply to all passed parameters.
668 # TODO maybe they should only apply to the following, unless there's only one
670 opts.ordering = GetoptLong::RETURN_IN_ORDER
674 :authors => Hash.new,
679 # Read config options
680 `git config --get-all rcs.authorsfile`.each_line do |fn|
681 parse_options[:authors].merge! load_authors_file(fn.chomp)
684 parse_options[:tag_each_rev] = (
685 `git config --bool rcs.tageachrev`.chomp == 'true'
688 parse_options[:log_filename] = (
689 `git config --bool rcs.logfilename`.chomp == 'true'
692 fuzz = `git config --int rcs.commitFuzz`.chomp
693 parse_options[:commit_fuzz] = fuzz.to_i unless fuzz.empty?
695 fuzz = `git config --int rcs.tagFuzz`.chomp
696 parse_options[:tag_fuzz] = fuzz.to_i unless fuzz.empty?
698 parse_options[:symbol_check] = (
699 `git config --bool rcs.symbolcheck`.chomp == 'false'
702 opts.each do |opt, arg|
704 when '--authors-file'
705 authors = load_authors_file(arg)
706 redef = parse_options[:authors].keys & authors.keys
707 STDERR.puts "Authors file #{arg} redefines #{redef.join(', ')}" unless redef.empty?
708 parse_options[:authors].merge!(authors)
709 when '--rcs-suffixes'
711 when '--rcs-commit-fuzz'
712 parse_options[:commit_fuzz] = arg.to_i
713 when '--rcs-tag-fuzz'
714 parse_options[:tag_fuzz] = arg.to_i
715 when '--symbol-check'
716 parse_options[:symbol_check] = true
717 when '--no-symbol-check'
718 parse_options[:symbol_check] = false
719 when '--tag-each-rev'
720 parse_options[:tag_each_rev] = true
721 when '--no-tag-each-rev'
722 # this is the default, which is fine since the missing key
723 # (default) returns nil which is false in Ruby
724 parse_options[:tag_each_rev] = false
725 when '--log-filename'
726 parse_options[:log_filename] = true
727 when '--no-log-filename'
728 # this is the default, which is fine since the missing key
729 # (default) returns nil which is false in Ruby
730 parse_options[:log_filename] = false
731 when '--skip-branches'
732 parse_options[:skip_branches] = true
741 if parse_options[:tag_fuzz] < parse_options[:commit_fuzz]
742 parse_options[:tag_fuzz] = parse_options[:commit_fuzz]
747 user = Etc.getlogin || ENV['USER']
749 # steal username/email data from other init files that may contain the
753 # the user's .hgrc file for a username field
754 ['~/.hgrc', /^\s*username\s*=\s*(["'])?(.*)\1$/, 2],
755 # the user's .(g)vimrc for a changelog_username setting
756 ['~/.vimrc', /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
757 ['~/.gvimrc', /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
758 ].each do |fn, rx, idx|
759 file = File.expand_path fn
760 if File.readable?(file) and File.read(file) =~ rx
761 parse_options[:authors][user] = Regexp.last_match(idx).strip
767 if user and not user.empty? and not parse_options[:authors].has_key?(user)
768 name = ENV['GIT_AUTHOR_NAME'] || ''
769 name.replace(`git config user.name`.chomp) if name.empty?
770 name.replace(Etc.getpwnam(user).gecos) if name.empty?
773 # couldn't find a name, try to steal data from other sources
776 # if we found a name, try to find an email too
777 email = ENV['GIT_AUTHOR_EMAIL'] || ''
778 email.replace(`git config user.email`.chomp) if email.empty?
781 # couldn't find an email, try to steal data too
784 # we got both a name and email, fill the info
785 parse_options[:authors][user] = "#{name} <#{email}>"
800 file_list.each do |arg|
801 case ftype = File.ftype(arg)
807 not_found "RCS file #{arg}"
810 filename = File.basename(arg, SFX)
812 filename = File.basename(arg)
813 path = File.dirname(arg)
814 rcsfile = File.join(path, 'RCS', filename) + SFX
815 unless File.exists? rcsfile
816 rcsfile.replace File.join(path, filename) + SFX
817 unless File.exists? rcsfile
818 not_found "RCS file for #{filename} in #{path}"
822 rcs << RCS.parse(filename, rcsfile)
824 argdirname = arg.chomp(File::SEPARATOR)
825 pattern = File.join(argdirname, '**', '*' + SFX)
826 Dir.glob(pattern).each do |rcsfile|
827 filename = File.basename(rcsfile, SFX)
828 path = File.dirname(rcsfile)
829 # strip trailing "/RCS" if present, or "RCS" if that's
831 path.sub!(/(^|#{File::SEPARATOR})RCS$/, '')
832 # strip off the portion of the path specified
833 # on the command line from the front of the path
834 # (or delete the path completely if it is the same
835 # as the specified directory)
836 path.sub!(/^#{Regexp.escape argdirname}(#{File::SEPARATOR}|$)/, '')
837 filename = File.join(path, filename) unless path.empty?
839 rcs << RCS.parse(filename, rcsfile)
840 rescue Exception => e
841 STDERR.puts "Failed to parse #{filename} @ #{rcsfile}:#{$.}"
846 STDERR.puts "Cannot handle #{arg} of #{ftype} type"
852 rcs.first.export_commits(parse_options)
854 STDERR.puts "Preparing commits"
859 r.revision.each do |k, rev|
861 commits << RCS::Commit.new(r, rev)
862 rescue NoBranchSupport
863 if parse_options[:skip_branches]
864 STDERR.puts "Skipping revision #{rev.rev} for #{r.fname} (branch)"
871 STDERR.puts "Sorting by date"
876 STDERR.puts "RAW commits (#{commits.length}):"
878 PP.pp c.to_a, $stderr
881 STDERR.puts "#{commits.length} single-file commits"
884 STDERR.puts "Coalescing [1] by date with fuzz #{parse_options[:commit_fuzz]}"
886 thisindex = commits.size
887 commits.reverse_each do |c|
888 nextindex = thisindex
891 cfiles = Set.new c.tree.filenames
896 # test for mergeable commits by looking at following commits
897 while nextindex < commits.size
898 k = commits[nextindex]
901 # commits are date-sorted, so we know we can quit early if we are too far
902 # for coalescing to work
903 break if k.min_date > c.max_date + parse_options[:commit_fuzz]
907 kfiles = Set.new k.tree.filenames
909 if c.log != k.log or c.author != k.author or c.branch != k.branch
913 unless c.symbols.subset?(k.symbols) or k.symbols.subset?(c.symbols)
914 cflist = cfiles.to_a.join(', ')
915 kflist = kfiles.to_a.join(', ')
916 if parse_options[:symbol_check]
917 STDERR.puts "Not coalescing #{c.log.inspect}\n\tfor (#{cflist})\n\tand (#{kflist})"
918 STDERR.puts "\tbecause their symbols disagree:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
919 STDERR.puts "\tretry with the --no-symbol-check option if you want to merge these commits anyway"
922 STDERR.puts "Coalescing #{c.log.inspect}\n\tfor (#{cflist})\n\tand (#{kflist})"
923 STDERR.puts "\twith disagreeing symbols:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
927 # keep track of filenames touched by commits we are not merging with,
928 # since we don't want to merge with commits that touch them, to preserve
929 # the monotonicity of history for each file
930 # TODO we could forward-merge with them, unless some of our files were
933 # if the candidate touches any file already in the commit,
934 # we can stop looking forward
935 break unless cfiles.intersection(kfiles).empty?
940 # the candidate has the same log, author, branch and appropriate symbols
941 # does it touch anything in ofiles?
942 unless ofiles.intersection(kfiles).empty?
944 cflist = cfiles.to_a.join(', ')
945 kflist = kfiles.to_a.join(', ')
946 oflist = ofiles.to_a.join(', ')
947 STDERR.puts "Not coalescing #{c.log.inspect}\n\tfor (#{cflist})\n\tand (#{kflist})"
948 STDERR.puts "\tbecause the latter intersects #{oflist} in #{(ofiles & kfiles).to_a.inspect}"
956 mergeable.each do |k|
959 rescue RuntimeError => err
960 fuzz = c.date - k.date
961 STDERR.puts "Fuzzy commit coalescing failed: #{err}"
962 STDERR.puts "\tretry with commit fuzz < #{fuzz} if you don't want to see this message"
970 STDERR.puts "[1] commits (#{commits.length}):"
972 PP.pp c.to_a, $stderr
975 STDERR.puts "#{commits.length} coalesced commits"
978 commits.each { |c| c.export(parse_options) }