5 * Refactor commit coalescing
6 * Add --strict-symbol-check to only coalesce commits if their symbol lists are equal
7 * Further coalescing options? (e.g. small logfile differences)
8 * Proper branching support in multi-file export
9 * Optimize memory usage by discarding unneeded text
15 # Integer#odd? was introduced in Ruby 1.8.7, backport it to
17 unless 2.respond_to? :odd?
27 #{$0} [options] file [file ...]
29 Fast-export the RCS history of one or more files. If a directory is specified,
30 all RCS-tracked files in the directory and its descendants are exported.
32 When importing single files, their pathname is discarded during import. When
33 importing directories, only the specified directory component is discarded.
35 When importing a single file, RCS commits are converted one by one. Otherwise,
36 some heuristics are used to determine how to coalesce commits touching different
39 Currently, commits are coalesced if they share the exact same log and if their
40 date differs by no more than the user-specified fuzziness. Additionally, the
41 symbols in one of the commits must be a subset of the symbols in the other
42 commit, unless --no-symbol-check is specified or rcs.symbolCheck is set to
43 false in the git configuration.
46 git init && rcs-fast-export.rb . | git fast-import && git reset
49 --help, -h, -? display this help text
50 --authors-file, -A specify a file containing username = Full Name <email> mappings
51 --rcs-commit-fuzz fuzziness in RCS commits to be considered a single one when
52 importing multiple files
53 (in seconds, defaults to 300, i.e. 5 minutes)
54 --[no-]symbol-check [do not] check symbols when coalescing commits
55 --[no-]tag-each-rev [do not] create a lightweight tag for each RCS revision when
56 importing a single file
57 --[no-]log-filename [do not] prepend the filename to the commit log when importing
61 rcs.authorsFile for --authors-file
62 rcs.tagEachRev for --tag-each-rev
63 rcs.logFilename for --log-filename
64 rcs.commitFuzz for --rcs-commit-fuzz
65 rcs.symbolCheck for --symbol-check
66 rcs.tagFuzz for --rcs-tag-fuzz
72 STDERR.puts "Could not find #{arg}"
75 # returns a hash that maps usernames to author names & emails
76 def load_authors_file(fn)
79 File.open(File.expand_path(fn)) do |io|
80 io.each_line do |line|
81 uname, author = line.split('=', 2)
84 STDERR.puts "Username #{uname} redefined to #{author}" if hash.has_key? uname
96 fields = string.split('.')
97 raise ArgumentError, "wrong number of fields for RCS date #{string}" unless fields.length == 6
103 # strip an optional final ;
108 # strip the first and last @, and de-double @@s
109 def RCS.sanitize(arg)
113 raise 'malformed first line' unless ret.first[0,1] == '@'
114 raise 'malformed last line' unless ret.last[-1,1] == '@'
115 ret.first.sub!(/^@/,'')
116 ret.last.sub!(/@$/,'')
117 ret.map { |l| l.gsub('@@','@') }
119 arg.chomp('@').sub(/^@/,'').gsub('@@','@')
126 def RCS.at_clean(arg)
127 RCS.sanitize RCS.clean(arg)
135 @@marks[key] = @@marks.length + 1
139 def RCS.blob(file, rev)
140 RCS.mark([file, rev])
143 def RCS.commit(commit)
148 attr_accessor :head, :comment, :desc, :revision, :fname, :mode
149 def initialize(fname, executable)
154 @revision = Hash.new { |h, r| h[r] = Revision.new(self, r) }
155 @mode = executable ? '755' : '644'
158 def has_revision?(rev)
159 @revision.has_key?(rev) and not @revision[rev].author.nil?
162 def export_commits(opts={})
165 until @revision.empty?
168 # a string sort is a very good candidate for
169 # export order, getting a miss only for
170 # multi-digit revision components
171 keys = @revision.keys.sort
173 STDERR.puts "commit export loop ##{counter}"
174 STDERR.puts "\t#{exported.length} commits exported so far: #{exported.join(', ')}" unless exported.empty?
175 STDERR.puts "\t#{keys.size} to export: #{keys.join(', ')}"
179 # the parent commit is rev.next if we're on the
180 # master branch (rev.branch is nil) or
181 # rev.diff_base otherwise
182 from = rev.branch.nil? ? rev.next : rev.diff_base
183 # A commit can only be exported if it has no
184 # parent, or if the parent has been exported
185 # already. Skip this commit otherwise
186 if from and not exported.include? from
190 branch = rev.branch || 'master'
191 author = opts[:authors][rev.author] || "#{rev.author} <empty>"
192 date = "#{rev.date.tv_sec} +0000"
194 if opts[:log_filename]
195 log << @fname << ": "
199 puts "commit refs/heads/#{branch}"
200 puts "mark :#{RCS.commit key}"
201 puts "committer #{author} #{date}"
202 puts "data #{log.length}"
203 puts log unless log.empty?
204 puts "from :#{RCS.commit from}" if rev.branch_point
205 puts "M #{@mode} :#{RCS.blob @fname, key} #{@fname}"
207 # TODO FIXME this *should* be safe, in
208 # that it should not unduly move
209 # branches back in time, but I'm not
211 rev.branches.each do |sym|
212 puts "reset refs/heads/#{sym}"
213 puts "from :#{RCS.commit key}"
215 rev.symbols.each do |sym|
216 puts "reset refs/tags/#{sym}"
217 puts "from :#{RCS.commit key}"
219 if opts[:tag_each_rev]
220 puts "reset refs/tags/#{key}"
221 puts "from :#{RCS.commit key}"
226 exported.each { |k| @revision.delete(k) }
232 attr_accessor :rev, :author, :state, :next
233 attr_accessor :branches, :log, :text, :symbols
234 attr_accessor :branch, :diff_base, :branch_point
236 def initialize(file, rev)
253 @date = Time.rcs(str)
258 ret = "blob\nmark :#{RCS.blob @file.fname, @rev}\ndata #{str.length}\n#{str}\n"
263 def RCS.parse(fname, rcsfile)
264 rcs = RCS::File.new(fname, ::File.executable?(rcsfile))
266 ::File.open(rcsfile, 'r') do |file|
271 file.each_line do |line|
274 command, args = line.split($;,2)
275 next if command.empty?
277 if command.chomp!(';')
278 STDERR.puts "Skipping empty command #{command.inspect}" if $DEBUG
284 rcs.head = RCS.clean(args.chomp)
290 rcs.comment = RCS.at_clean(args.chomp)
293 if rcs.has_revision?(rev)
294 status.push :revision_data
296 status.push :new_revision
301 status.push :read_lines
302 when 'branch', 'access', 'locks', 'expand'
303 STDERR.puts "Skipping unhandled command #{command.inspect}" if $DEBUG
304 status.push :skipping_lines
308 raise "Unknown command #{command.inspect}"
311 status.pop if line.strip.chomp!(';')
313 # we can have multiple symbols per line
314 pairs = line.strip.split($;)
316 sym, rev = pair.strip.split(':',2);
318 status.pop if rev.chomp!(';')
319 rcs.revision[rev].symbols << sym
325 rcs.desc.replace lines.dup
328 # we sanitize lines as we read them
330 actual_line = line.dup
332 # the first line must begin with a @, which we strip
334 ats = line.match(/^@+/)
335 raise 'malformed line' unless ats
336 actual_line.replace line.sub(/^@/,'')
339 # if the line ends with an ODD number of @, it's the
340 # last line -- we work on actual_line so that content
341 # such as @\n or @ work correctly (they would be
342 # encoded respectively as ['@@@\n','@\n'] and
344 ats = actual_line.chomp.match(/@+$/)
345 if nomore = (ats && Regexp.last_match(0).length.odd?)
346 actual_line.replace actual_line.chomp.sub(/@$/,'')
348 lines << actual_line.gsub('@@','@')
355 when /^date\s+(\S+);\s+author\s+(\S+);\s+state\s+(\S+);$/
356 rcs.revision[rev].date = $1
357 rcs.revision[rev].author = $2
358 rcs.revision[rev].state = $3
360 status.push :branches
363 when /^next\s+(\S+)?;$/
364 nxt = rcs.revision[rev].next = $1
366 raise "multiple diff_bases for #{nxt}" unless rcs.revision[nxt].diff_base.nil?
367 rcs.revision[nxt].diff_base = rev
368 rcs.revision[nxt].branch = rcs.revision[rev].branch
373 candidate = line.split(';',2)
374 branch = candidate.first.strip
375 rcs.revision[rev].branches.push branch
376 raise "multiple diff_bases for #{branch}" unless rcs.revision[branch].diff_base.nil?
377 rcs.revision[branch].diff_base = rev
378 # we drop the last number from the branch name
379 rcs.revision[branch].branch = branch.sub(/\.\d+$/,'.x')
380 rcs.revision[branch].branch_point = rev
381 status.pop if candidate.length > 1
387 status.push :read_lines
395 status.push :read_lines
400 rcs.revision[rev].log.replace lines.dup
403 rcs.revision[rev].text.replace lines.dup
404 puts rcs.revision[rev].blob
407 difflines.replace lines.dup
408 difflines.pop if difflines.last.empty?
409 base = rcs.revision[rev].diff_base
410 unless rcs.revision[base].text
413 raise 'no diff base!'
417 rcs.revision[base].text.each { |l| buffer << [l.dup] }
423 while l = difflines.shift
425 raise 'negative index during insertion' if index < 0
426 raise 'negative count during insertion' if count < 0
429 # collected all the lines, put them before
434 buffer[index].unshift(*adding)
441 raise 'malformed diff' unless l =~ /^([ad])(\d+) (\d+)$/
447 # for deletion, index 1 is the first index, so the Ruby
448 # index is one less than the diff one
450 # we replace them with empty string so that 'a' commands
451 # referring to the same line work properly
458 # addition will prepend the appropriate lines
459 # to the given index, and in this case Ruby
460 # and diff indices are the same
465 # turn the buffer into an array of lines, deleting the empty ones
466 buffer.delete_if { |l| l.empty? }
469 rcs.revision[rev].text = buffer
470 puts rcs.revision[rev].blob
473 raise "Unknown status #{status.last}"
478 # clean up the symbols/branches: look for revisions that have
479 # one or more symbols but no dates, and make them into
480 # branches, pointing to the highest commit with that key
482 keys = rcs.revision.keys
483 rcs.revision.each do |key, rev|
484 if rev.date.nil? and not rev.symbols.empty?
485 top = keys.select { |k| k.match(/^#{key}\./) }.sort.last
486 tr = rcs.revision[top]
487 raise "unhandled complex branch structure met: #{rev.inspect} refers #{tr.inspect}" if tr.date.nil?
488 tr.branches |= rev.symbols
492 branches.each { |k| rcs.revision.delete k }
498 def initialize(commit)
504 testfiles = @files.dup
505 tree.each { |rcs, rev| self.add(rcs, rev, testfiles) }
506 # the next line is only reached if all the adds were
507 # successful, so the merge is atomic
508 @files.replace testfiles
511 def add(rcs, rev, file_list=@files)
512 if file_list.key? rcs
513 prev = file_list[rcs]
514 if prev.log == rev.log
515 str = "re-adding existing file #{rcs.fname} (old: #{prev.rev}, new: #{rev.rev})"
517 str = "re-adding existing file #{rcs.fname} (old: #{[prev.rev, prev.log.to_s].inspect}, new: #{[rev.rev, rev.log.to_s].inspect})"
519 if prev.text != rev.text
522 @commit.warn_about str
534 @files.map do |rcs, rev|
535 files << "M #{rcs.mode} :#{RCS.blob rcs.fname, rev.rev} #{rcs.fname}"
541 @files.map { |rcs, rev| rcs.fname }
550 attr_accessor :date, :log, :symbols, :author, :branch
552 def initialize(rcs, rev)
553 raise NotImplementedError if rev.branch
554 self.date = rev.date.dup
555 self.log = rev.log.dup
556 self.symbols = rev.symbols.dup
557 self.author = rev.author
558 self.branch = rev.branch
560 self.tree = Tree.new self
561 self.tree.add rcs, rev
565 [self.date, self.branch, self.symbols, self.author, self.log, self.tree.to_a]
569 warn str + " for commit on #{self.date}"
572 # Sort by date and then by number of symbols
574 ds = self.date <=> other.date
578 return self.symbols.length <=> other.symbols.length
583 self.tree.merge! commit.tree
584 if commit.date > self.date
585 warn_about "updating date to #{commit.date}"
586 self.date = commit.date
588 self.symbols.merge commit.symbols
592 xbranch = self.branch || 'master'
593 xauthor = opts[:authors][self.author] || "#{self.author} <empty>"
595 numdate = self.date.tv_sec
596 xdate = "#{numdate} +0000"
599 puts "commit refs/heads/#{xbranch}"
600 puts "mark :#{RCS.commit key}"
601 puts "committer #{xauthor} #{xdate}"
602 puts "data #{xlog.length}"
603 puts xlog unless xlog.empty?
604 # TODO branching support for multi-file export
605 # puts "from :#{RCS.commit from}" if self.branch_point
608 # TODO branching support for multi-file export
609 # rev.branches.each do |sym|
610 # puts "reset refs/heads/#{sym}"
611 # puts "from :#{RCS.commit key}"
614 self.symbols.each do |sym|
615 puts "reset refs/tags/#{sym}"
616 puts "from :#{RCS.commit key}"
625 opts = GetoptLong.new(
626 # Authors file, like git-svn and git-cvsimport, more than one can be
628 ['--authors-file', '-A', GetoptLong::REQUIRED_ARGUMENT],
629 # RCS file suffix, like RCS
630 ['--rcs-suffixes', '-x', GetoptLong::REQUIRED_ARGUMENT],
631 # Date fuzziness for commits to be considered the same (in seconds)
632 ['--rcs-commit-fuzz', GetoptLong::REQUIRED_ARGUMENT],
633 # check symbols when coalescing?
634 ['--symbol-check', GetoptLong::NO_ARGUMENT],
635 ['--no-symbol-check', GetoptLong::NO_ARGUMENT],
637 ['--tag-each-rev', GetoptLong::NO_ARGUMENT],
638 ['--no-tag-each-rev', GetoptLong::NO_ARGUMENT],
639 # prepend filenames to commit logs?
640 ['--log-filename', GetoptLong::NO_ARGUMENT],
641 ['--no-log-filename', GetoptLong::NO_ARGUMENT],
642 ['--help', '-h', '-?', GetoptLong::NO_ARGUMENT]
645 # We read options in order, but they apply to all passed parameters.
646 # TODO maybe they should only apply to the following, unless there's only one
648 opts.ordering = GetoptLong::RETURN_IN_ORDER
652 :authors => Hash.new,
657 # Read config options
658 `git config --get-all rcs.authorsfile`.each_line do |fn|
659 parse_options[:authors].merge! load_authors_file(fn.chomp)
662 parse_options[:tag_each_rev] = (
663 `git config --bool rcs.tageachrev`.chomp == 'true'
666 parse_options[:log_filename] = (
667 `git config --bool rcs.logfilename`.chomp == 'true'
670 fuzz = `git config --int rcs.commitFuzz`.chomp
671 parse_options[:commit_fuzz] = fuzz.to_i unless fuzz.empty?
673 fuzz = `git config --int rcs.tagFuzz`.chomp
674 parse_options[:tag_fuzz] = fuzz.to_i unless fuzz.empty?
676 parse_options[:symbol_check] = (
677 `git config --bool rcs.symbolcheck`.chomp == 'false'
680 opts.each do |opt, arg|
682 when '--authors-file'
683 authors = load_authors_file(arg)
684 redef = parse_options[:authors].keys & authors.keys
685 STDERR.puts "Authors file #{arg} redefines #{redef.join(', ')}" unless redef.empty?
686 parse_options[:authors].merge!(authors)
687 when '--rcs-suffixes'
689 when '--rcs-commit-fuzz'
690 parse_options[:commit_fuzz] = arg.to_i
691 when '--rcs-tag-fuzz'
692 parse_options[:tag_fuzz] = arg.to_i
693 when '--symbol-check'
694 parse_options[:symbol_check] = true
695 when '--no-symbol-check'
696 parse_options[:symbol_check] = false
697 when '--tag-each-rev'
698 parse_options[:tag_each_rev] = true
699 when '--no-tag-each-rev'
700 # this is the default, which is fine since the missing key
701 # (default) returns nil which is false in Ruby
702 parse_options[:tag_each_rev] = false
703 when '--log-filename'
704 parse_options[:log_filename] = true
705 when '--no-log-filename'
706 # this is the default, which is fine since the missing key
707 # (default) returns nil which is false in Ruby
708 parse_options[:log_filename] = false
717 if parse_options[:tag_fuzz] < parse_options[:commit_fuzz]
718 parse_options[:tag_fuzz] = parse_options[:commit_fuzz]
723 user = Etc.getlogin || ENV['USER']
725 # steal username/email data from other init files that may contain the
729 # the user's .hgrc file for a username field
730 ['~/.hgrc', /^\s*username\s*=\s*(["'])?(.*)\1$/, 2],
731 # the user's .(g)vimrc for a changelog_username setting
732 ['~/.vimrc', /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
733 ['~/.gvimrc', /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
735 ].each do |fn, rx, idx|
736 file = File.expand_path fn
737 if File.readable?(file) and File.read(file) =~ rx
738 parse_options[:authors][user] = Regexp.last_match(idx).strip
744 if user and not user.empty? and not parse_options[:authors].has_key?(user)
745 name = ENV['GIT_AUTHOR_NAME'] || ''
746 name.replace(`git config user.name`.chomp) if name.empty?
747 name.replace(Etc.getpwnam(user).gecos) if name.empty?
750 # couldn't find a name, try to steal data from other sources
753 # if we found a name, try to find an email too
754 email = ENV['GIT_AUTHOR_EMAIL'] || ''
755 email.replace(`git config user.email`.chomp) if email.empty?
758 # couldn't find an email, try to steal data too
761 # we got both a name and email, fill the info
762 parse_options[:authors][user] = "#{name} <#{email}>"
777 file_list.each do |arg|
778 case ftype = File.ftype(arg)
784 not_found "RCS file #{arg}"
787 filename = File.basename(arg, SFX)
789 filename = File.basename(arg)
790 path = File.dirname(arg)
791 rcsfile = File.join(path, 'RCS', filename) + SFX
792 unless File.exists? rcsfile
793 rcsfile.replace File.join(path, filename) + SFX
794 unless File.exists? rcsfile
795 not_found "RCS file for #{filename} in #{path}"
799 rcs << RCS.parse(filename, rcsfile)
801 pattern = File.join(arg, '**', '*' + SFX)
802 Dir.glob(pattern).each do |rcsfile|
803 filename = File.basename(rcsfile, SFX)
804 path = File.dirname(rcsfile)
805 path.sub!(/\/?RCS$/, '') # strip final /RCS if present
806 path.sub!(/^#{Regexp.escape arg}\/?/, '') # strip initial dirname
807 filename = File.join(path, filename) unless path.empty?
809 rcs << RCS.parse(filename, rcsfile)
810 rescue Exception => e
811 STDERR.puts "Failed to parse #{filename} @ #{rcsfile}:#{$.}"
816 STDERR.puts "Cannot handle #{arg} of #{ftype} type"
822 rcs.first.export_commits(parse_options)
824 STDERR.puts "Preparing commits"
829 r.revision.each do |k, rev|
830 commits << RCS::Commit.new(r, rev)
834 STDERR.puts "Sorting by date"
839 STDERR.puts "RAW commits (#{commits.length}):"
841 PP.pp c.to_a, $stderr
844 STDERR.puts "#{commits.length} single-file commits"
847 STDERR.puts "Coalescing [1] by date with fuzz #{parse_options[:commit_fuzz]}"
849 commits.reverse_each do |c|
850 commits.reverse_each do |k|
851 break if k.date < c.date - parse_options[:commit_fuzz]
853 next if c.log != k.log or c.author != k.author or c.branch != k.branch
854 next if k.date > c.date
855 unless c.symbols.subset?(k.symbols) or k.symbols.subset?(c.symbols)
856 if parse_options[:symbol_check]
857 STDERR.puts "Not coalescing #{c.log.inspect}\n\tfor (#{c.tree.filenames.join(', ')})\n\tand (#{k.tree.filenames.join(', ')})"
858 STDERR.puts "\tbecause their symbols disagree:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
859 STDERR.puts "\tretry with the --no-symbol-check option if you want to merge these commits anyway"
862 STDERR.puts "Coalescing #{c.log.inspect}\n\tfor (#{c.tree.filenames.join(', ')})\n\tand (#{k.tree.filenames.join(', ')})"
863 STDERR.puts "\twith disagreeing symbols:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
868 rescue RuntimeError => err
869 fuzz = c.date - k.date
870 STDERR.puts "Fuzzy commit coalescing failed: #{err}"
871 STDERR.puts "\tretry with commit fuzz < #{fuzz} if you don't want to see this message"
879 STDERR.puts "[1] commits (#{commits.length}):"
881 PP.pp c.to_a, $stderr
884 STDERR.puts "#{commits.length} coalesced commits"
887 commits.each { |c| c.export(parse_options) }