5 * Refactor commit coalescing
6 * Add --strict-symbol-check to only coalesce commits if their symbol lists are equal
7 * Add support for commitid for coalescing commits
8 * Further coalescing options? (e.g. small logfile differences)
9 * Proper branching support in multi-file export
10 * Optimize memory usage by discarding unneeded text
16 class NoBranchSupport < NotImplementedError ; end
18 # Integer#odd? was introduced in Ruby 1.8.7, backport it to
20 unless 2.respond_to? :odd?
30 #{$0} [options] file [file ...]
32 Fast-export the RCS history of one or more files. If a directory is specified,
33 all RCS-tracked files in the directory and its descendants are exported.
35 When importing single files, their pathname is discarded during import. When
36 importing directories, only the specified directory component is discarded.
38 When importing a single file, RCS commits are converted one by one. Otherwise,
39 some heuristics are used to determine how to coalesce commits touching different
42 Currently, commits are coalesced if they share the exact same log and if their
43 date differs by no more than the user-specified fuzziness. Additionally, the
44 symbols in one of the commits must be a subset of the symbols in the other
45 commit, unless --no-symbol-check is specified or rcs.symbolCheck is set to
46 false in the git configuration.
49 git init && rcs-fast-export.rb . | git fast-import && git reset
52 --help, -h, -? display this help text
53 --authors-file, -A specify a file containing username = Full Name <email> mappings
54 --rcs-commit-fuzz fuzziness in RCS commits to be considered a single one when
55 importing multiple files
56 (in seconds, defaults to 300, i.e. 5 minutes)
57 --[no-]symbol-check [do not] check symbols when coalescing commits
58 --[no-]tag-each-rev [do not] create a lightweight tag for each RCS revision when
59 importing a single file
60 --[no-]log-filename [do not] prepend the filename to the commit log when importing
62 --skip-branches when exporting multiple files with a branched history, export
63 the main branch only instead of aborting due to the lack of
64 support for branched multi-file history export
69 rcs.authorsFile for --authors-file
70 rcs.tagEachRev for --tag-each-rev
71 rcs.logFilename for --log-filename
72 rcs.commitFuzz for --rcs-commit-fuzz
73 rcs.symbolCheck for --symbol-check
74 rcs.tagFuzz for --rcs-tag-fuzz
80 STDERR.puts "Could not find #{arg}"
83 # returns a hash that maps usernames to author names & emails
84 def load_authors_file(fn)
87 File.open(File.expand_path(fn)) do |io|
88 io.each_line do |line|
89 uname, author = line.split('=', 2)
92 STDERR.puts "Username #{uname} redefined to #{author}" if hash.has_key? uname
104 fields = string.split('.')
105 raise ArgumentError, "wrong number of fields for RCS date #{string}" unless fields.length == 6
111 # strip an optional final ;
116 # strip the first and last @, and de-double @@s
117 def RCS.sanitize(arg)
121 raise 'malformed first line' unless ret.first[0,1] == '@'
122 raise 'malformed last line' unless ret.last[-1,1] == '@'
123 ret.first.sub!(/^@/,'')
124 ret.last.sub!(/@$/,'')
125 ret.map { |l| l.gsub('@@','@') }
127 arg.chomp('@').sub(/^@/,'').gsub('@@','@')
134 def RCS.at_clean(arg)
135 RCS.sanitize RCS.clean(arg)
143 @@marks[key] = @@marks.length + 1
147 def RCS.blob(file, rev)
148 RCS.mark([file, rev])
151 def RCS.commit(commit)
156 attr_accessor :head, :comment, :desc, :revision, :fname, :mode
157 def initialize(fname, executable)
162 @revision = Hash.new { |h, r| h[r] = Revision.new(self, r) }
163 @mode = executable ? '755' : '644'
166 def has_revision?(rev)
167 @revision.has_key?(rev) and not @revision[rev].author.nil?
170 def export_commits(opts={})
173 until @revision.empty?
176 # a string sort is a very good candidate for
177 # export order, getting a miss only for
178 # multi-digit revision components
179 keys = @revision.keys.sort
181 STDERR.puts "commit export loop ##{counter}"
182 STDERR.puts "\t#{exported.length} commits exported so far: #{exported.join(', ')}" unless exported.empty?
183 STDERR.puts "\t#{keys.size} to export: #{keys.join(', ')}"
187 # the parent commit is rev.next if we're on the
188 # master branch (rev.branch is nil) or
189 # rev.diff_base otherwise
190 from = rev.branch.nil? ? rev.next : rev.diff_base
191 # A commit can only be exported if it has no
192 # parent, or if the parent has been exported
193 # already. Skip this commit otherwise
194 if from and not exported.include? from
198 branch = rev.branch || 'master'
199 author = opts[:authors][rev.author] || "#{rev.author} <empty>"
200 date = "#{rev.date.tv_sec} +0000"
202 if opts[:log_filename]
203 log << @fname << ": "
207 puts "commit refs/heads/#{branch}"
208 puts "mark :#{RCS.commit key}"
209 puts "committer #{author} #{date}"
210 puts "data #{log.length}"
211 puts log unless log.empty?
212 puts "from :#{RCS.commit from}" if rev.branch_point
213 puts "M #{@mode} :#{RCS.blob @fname, key} #{@fname}"
215 # TODO FIXME this *should* be safe, in
216 # that it should not unduly move
217 # branches back in time, but I'm not
219 rev.branches.each do |sym|
220 puts "reset refs/heads/#{sym}"
221 puts "from :#{RCS.commit key}"
223 rev.symbols.each do |sym|
224 puts "reset refs/tags/#{sym}"
225 puts "from :#{RCS.commit key}"
227 if opts[:tag_each_rev]
228 puts "reset refs/tags/#{key}"
229 puts "from :#{RCS.commit key}"
234 exported.each { |k| @revision.delete(k) }
240 attr_accessor :rev, :author, :state, :next
241 attr_accessor :branches, :log, :text, :symbols
242 attr_accessor :branch, :diff_base, :branch_point
244 def initialize(file, rev)
261 @date = Time.rcs(str)
266 ret = "blob\nmark :#{RCS.blob @file.fname, @rev}\ndata #{str.length}\n#{str}\n"
271 def RCS.parse(fname, rcsfile)
272 rcs = RCS::File.new(fname, ::File.executable?(rcsfile))
274 ::File.open(rcsfile, 'r:ASCII-8BIT') do |file|
279 file.each_line do |line|
282 command, args = line.split($;,2)
283 next if command.empty?
285 if command.chomp!(';')
286 STDERR.puts "Skipping empty command #{command.inspect}" if $DEBUG
292 rcs.head = RCS.clean(args.chomp)
298 rcs.comment = RCS.at_clean(args.chomp)
301 if rcs.has_revision?(rev)
302 status.push :revision_data
304 status.push :new_revision
309 status.push :read_lines
310 when 'branch', 'access', 'locks', 'expand'
311 STDERR.puts "Skipping unhandled command #{command.inspect}" if $DEBUG
312 status.push :skipping_lines
316 raise "Unknown command #{command.inspect}"
319 status.pop if line.strip.chomp!(';')
321 # we can have multiple symbols per line
322 pairs = line.strip.split($;)
324 sym, rev = pair.strip.split(':',2);
326 status.pop if rev.chomp!(';')
327 rcs.revision[rev].symbols << sym
333 rcs.desc.replace lines.dup
336 # we sanitize lines as we read them
338 actual_line = line.dup
340 # the first line must begin with a @, which we strip
342 ats = line.match(/^@+/)
343 raise 'malformed line' unless ats
344 actual_line.replace line.sub(/^@/,'')
347 # if the line ends with an ODD number of @, it's the
348 # last line -- we work on actual_line so that content
349 # such as @\n or @ work correctly (they would be
350 # encoded respectively as ['@@@\n','@\n'] and
352 ats = actual_line.chomp.match(/@+$/)
353 if nomore = (ats && Regexp.last_match(0).length.odd?)
354 actual_line.replace actual_line.chomp.sub(/@$/,'')
356 lines << actual_line.gsub('@@','@')
363 when /^date\s+(\S+);\s+author\s+(\S+);\s+state\s+(\S+);$/
364 rcs.revision[rev].date = $1
365 rcs.revision[rev].author = $2
366 rcs.revision[rev].state = $3
368 status.push :branches
371 when /^next\s+(\S+)?;$/
372 nxt = rcs.revision[rev].next = $1
374 raise "multiple diff_bases for #{nxt}" unless rcs.revision[nxt].diff_base.nil?
375 rcs.revision[nxt].diff_base = rev
376 rcs.revision[nxt].branch = rcs.revision[rev].branch
381 candidate = line.split(';',2)
382 branch = candidate.first.strip
383 rcs.revision[rev].branches << branch
384 raise "multiple diff_bases for #{branch}" unless rcs.revision[branch].diff_base.nil?
385 rcs.revision[branch].diff_base = rev
386 # we drop the last number from the branch name
387 rcs.revision[branch].branch = branch.sub(/\.\d+$/,'.x')
388 rcs.revision[branch].branch_point = rev
389 status.pop if candidate.length > 1
395 status.push :read_lines
403 status.push :read_lines
408 rcs.revision[rev].log.replace lines.dup
411 rcs.revision[rev].text.replace lines.dup
412 puts rcs.revision[rev].blob
415 difflines.replace lines.dup
416 difflines.pop if difflines.last.empty?
417 base = rcs.revision[rev].diff_base
418 unless rcs.revision[base].text
421 raise 'no diff base!'
425 rcs.revision[base].text.each { |l| buffer << [l.dup] }
431 while l = difflines.shift
433 raise 'negative index during insertion' if index < 0
434 raise 'negative count during insertion' if count < 0
437 # collected all the lines, put them before
442 buffer[index].unshift(*adding)
449 raise 'malformed diff' unless l =~ /^([ad])(\d+) (\d+)$/
455 # for deletion, index 1 is the first index, so the Ruby
456 # index is one less than the diff one
458 # we replace them with empty string so that 'a' commands
459 # referring to the same line work properly
466 # addition will prepend the appropriate lines
467 # to the given index, and in this case Ruby
468 # and diff indices are the same
473 # turn the buffer into an array of lines, deleting the empty ones
474 buffer.delete_if { |l| l.empty? }
477 rcs.revision[rev].text = buffer
478 puts rcs.revision[rev].blob
481 raise "Unknown status #{status.last}"
486 # clean up the symbols/branches: look for revisions that have
487 # one or more symbols but no dates, and make them into
488 # branches, pointing to the highest commit with that key
490 keys = rcs.revision.keys
491 rcs.revision.each do |key, rev|
492 if rev.date.nil? and not rev.symbols.empty?
493 top = keys.select { |k| k.match(/^#{key}\./) }.sort.last
494 tr = rcs.revision[top]
495 raise "unhandled complex branch structure met: #{rev.inspect} refers #{tr.inspect}" if tr.date.nil?
496 tr.branches |= rev.symbols
500 branches.each { |k| rcs.revision.delete k }
506 def initialize(commit)
512 testfiles = @files.dup
513 tree.each { |rcs, rev| self.add(rcs, rev, testfiles) }
514 # the next line is only reached if all the adds were
515 # successful, so the merge is atomic
516 @files.replace testfiles
519 def add(rcs, rev, file_list=@files)
520 if file_list.key? rcs
521 prev = file_list[rcs]
522 if prev.log == rev.log
523 str = "re-adding existing file #{rcs.fname} (old: #{prev.rev}, new: #{rev.rev})"
525 str = "re-adding existing file #{rcs.fname} (old: #{[prev.rev, prev.log.to_s].inspect}, new: #{[rev.rev, rev.log.to_s].inspect})"
527 if prev.text != rev.text
530 @commit.warn_about str
542 @files.map do |rcs, rev|
543 files << "M #{rcs.mode} :#{RCS.blob rcs.fname, rev.rev} #{rcs.fname}"
549 @files.map { |rcs, rev| rcs.fname }
558 attr_accessor :date, :log, :symbols, :author, :branch
560 def initialize(rcs, rev)
561 raise NoBranchSupport if rev.branch
562 self.date = rev.date.dup
563 self.log = rev.log.dup
564 self.symbols = rev.symbols.dup
565 self.author = rev.author
566 self.branch = rev.branch
568 self.tree = Tree.new self
569 self.tree.add rcs, rev
573 [self.date, self.branch, self.symbols, self.author, self.log, self.tree.to_a]
577 warn str + " for commit on #{self.date}"
580 # Sort by date and then by number of symbols
582 ds = self.date <=> other.date
586 return self.symbols.length <=> other.symbols.length
591 self.tree.merge! commit.tree
592 if commit.date > self.date
593 warn_about "updating date to #{commit.date}"
594 self.date = commit.date
596 self.symbols.merge commit.symbols
600 xbranch = self.branch || 'master'
601 xauthor = opts[:authors][self.author] || "#{self.author} <empty>"
603 numdate = self.date.tv_sec
604 xdate = "#{numdate} +0000"
607 puts "commit refs/heads/#{xbranch}"
608 puts "mark :#{RCS.commit key}"
609 puts "committer #{xauthor} #{xdate}"
610 puts "data #{xlog.length}"
611 puts xlog unless xlog.empty?
612 # TODO branching support for multi-file export
613 # puts "from :#{RCS.commit from}" if self.branch_point
616 # TODO branching support for multi-file export
617 # rev.branches.each do |sym|
618 # puts "reset refs/heads/#{sym}"
619 # puts "from :#{RCS.commit key}"
622 self.symbols.each do |sym|
623 puts "reset refs/tags/#{sym}"
624 puts "from :#{RCS.commit key}"
633 opts = GetoptLong.new(
634 # Authors file, like git-svn and git-cvsimport, more than one can be
636 ['--authors-file', '-A', GetoptLong::REQUIRED_ARGUMENT],
637 # RCS file suffix, like RCS
638 ['--rcs-suffixes', '-x', GetoptLong::REQUIRED_ARGUMENT],
639 # Date fuzziness for commits to be considered the same (in seconds)
640 ['--rcs-commit-fuzz', GetoptLong::REQUIRED_ARGUMENT],
641 # check symbols when coalescing?
642 ['--symbol-check', GetoptLong::NO_ARGUMENT],
643 ['--no-symbol-check', GetoptLong::NO_ARGUMENT],
645 ['--tag-each-rev', GetoptLong::NO_ARGUMENT],
646 ['--no-tag-each-rev', GetoptLong::NO_ARGUMENT],
647 # prepend filenames to commit logs?
648 ['--log-filename', GetoptLong::NO_ARGUMENT],
649 ['--no-log-filename', GetoptLong::NO_ARGUMENT],
650 # skip branches when exporting a whole tree?
651 ['--skip-branches', GetoptLong::NO_ARGUMENT],
652 ['--help', '-h', '-?', GetoptLong::NO_ARGUMENT]
655 # We read options in order, but they apply to all passed parameters.
656 # TODO maybe they should only apply to the following, unless there's only one
658 opts.ordering = GetoptLong::RETURN_IN_ORDER
662 :authors => Hash.new,
667 # Read config options
668 `git config --get-all rcs.authorsfile`.each_line do |fn|
669 parse_options[:authors].merge! load_authors_file(fn.chomp)
672 parse_options[:tag_each_rev] = (
673 `git config --bool rcs.tageachrev`.chomp == 'true'
676 parse_options[:log_filename] = (
677 `git config --bool rcs.logfilename`.chomp == 'true'
680 fuzz = `git config --int rcs.commitFuzz`.chomp
681 parse_options[:commit_fuzz] = fuzz.to_i unless fuzz.empty?
683 fuzz = `git config --int rcs.tagFuzz`.chomp
684 parse_options[:tag_fuzz] = fuzz.to_i unless fuzz.empty?
686 parse_options[:symbol_check] = (
687 `git config --bool rcs.symbolcheck`.chomp == 'false'
690 opts.each do |opt, arg|
692 when '--authors-file'
693 authors = load_authors_file(arg)
694 redef = parse_options[:authors].keys & authors.keys
695 STDERR.puts "Authors file #{arg} redefines #{redef.join(', ')}" unless redef.empty?
696 parse_options[:authors].merge!(authors)
697 when '--rcs-suffixes'
699 when '--rcs-commit-fuzz'
700 parse_options[:commit_fuzz] = arg.to_i
701 when '--rcs-tag-fuzz'
702 parse_options[:tag_fuzz] = arg.to_i
703 when '--symbol-check'
704 parse_options[:symbol_check] = true
705 when '--no-symbol-check'
706 parse_options[:symbol_check] = false
707 when '--tag-each-rev'
708 parse_options[:tag_each_rev] = true
709 when '--no-tag-each-rev'
710 # this is the default, which is fine since the missing key
711 # (default) returns nil which is false in Ruby
712 parse_options[:tag_each_rev] = false
713 when '--log-filename'
714 parse_options[:log_filename] = true
715 when '--no-log-filename'
716 # this is the default, which is fine since the missing key
717 # (default) returns nil which is false in Ruby
718 parse_options[:log_filename] = false
719 when '--skip-branches'
720 parse_options[:skip_branches] = true
729 if parse_options[:tag_fuzz] < parse_options[:commit_fuzz]
730 parse_options[:tag_fuzz] = parse_options[:commit_fuzz]
735 user = Etc.getlogin || ENV['USER']
737 # steal username/email data from other init files that may contain the
741 # the user's .hgrc file for a username field
742 ['~/.hgrc', /^\s*username\s*=\s*(["'])?(.*)\1$/, 2],
743 # the user's .(g)vimrc for a changelog_username setting
744 ['~/.vimrc', /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
745 ['~/.gvimrc', /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
747 ].each do |fn, rx, idx|
748 file = File.expand_path fn
749 if File.readable?(file) and File.read(file) =~ rx
750 parse_options[:authors][user] = Regexp.last_match(idx).strip
756 if user and not user.empty? and not parse_options[:authors].has_key?(user)
757 name = ENV['GIT_AUTHOR_NAME'] || ''
758 name.replace(`git config user.name`.chomp) if name.empty?
759 name.replace(Etc.getpwnam(user).gecos) if name.empty?
762 # couldn't find a name, try to steal data from other sources
765 # if we found a name, try to find an email too
766 email = ENV['GIT_AUTHOR_EMAIL'] || ''
767 email.replace(`git config user.email`.chomp) if email.empty?
770 # couldn't find an email, try to steal data too
773 # we got both a name and email, fill the info
774 parse_options[:authors][user] = "#{name} <#{email}>"
789 file_list.each do |arg|
790 case ftype = File.ftype(arg)
796 not_found "RCS file #{arg}"
799 filename = File.basename(arg, SFX)
801 filename = File.basename(arg)
802 path = File.dirname(arg)
803 rcsfile = File.join(path, 'RCS', filename) + SFX
804 unless File.exists? rcsfile
805 rcsfile.replace File.join(path, filename) + SFX
806 unless File.exists? rcsfile
807 not_found "RCS file for #{filename} in #{path}"
811 rcs << RCS.parse(filename, rcsfile)
813 pattern = File.join(arg, '**', '*' + SFX)
814 Dir.glob(pattern).each do |rcsfile|
815 filename = File.basename(rcsfile, SFX)
816 path = File.dirname(rcsfile)
817 path.sub!(/\/?RCS$/, '') # strip final /RCS if present
818 path.sub!(/^#{Regexp.escape arg}\/?/, '') # strip initial dirname
819 filename = File.join(path, filename) unless path.empty?
821 rcs << RCS.parse(filename, rcsfile)
822 rescue Exception => e
823 STDERR.puts "Failed to parse #{filename} @ #{rcsfile}:#{$.}"
828 STDERR.puts "Cannot handle #{arg} of #{ftype} type"
834 rcs.first.export_commits(parse_options)
836 STDERR.puts "Preparing commits"
841 r.revision.each do |k, rev|
843 commits << RCS::Commit.new(r, rev)
844 rescue NoBranchSupport
845 if parse_options[:skip_branches]
846 STDERR.puts "Skipping revision #{rev.rev} for #{r.fname} (branch)"
853 STDERR.puts "Sorting by date"
858 STDERR.puts "RAW commits (#{commits.length}):"
860 PP.pp c.to_a, $stderr
863 STDERR.puts "#{commits.length} single-file commits"
866 STDERR.puts "Coalescing [1] by date with fuzz #{parse_options[:commit_fuzz]}"
868 commits.reverse_each do |c|
869 commits.reverse_each do |k|
870 break if k.date < c.date - parse_options[:commit_fuzz]
872 next if c.log != k.log or c.author != k.author or c.branch != k.branch
873 next if k.date > c.date
874 unless c.symbols.subset?(k.symbols) or k.symbols.subset?(c.symbols)
875 if parse_options[:symbol_check]
876 STDERR.puts "Not coalescing #{c.log.inspect}\n\tfor (#{c.tree.filenames.join(', ')})\n\tand (#{k.tree.filenames.join(', ')})"
877 STDERR.puts "\tbecause their symbols disagree:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
878 STDERR.puts "\tretry with the --no-symbol-check option if you want to merge these commits anyway"
881 STDERR.puts "Coalescing #{c.log.inspect}\n\tfor (#{c.tree.filenames.join(', ')})\n\tand (#{k.tree.filenames.join(', ')})"
882 STDERR.puts "\twith disagreeing symbols:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
887 rescue RuntimeError => err
888 fuzz = c.date - k.date
889 STDERR.puts "Fuzzy commit coalescing failed: #{err}"
890 STDERR.puts "\tretry with commit fuzz < #{fuzz} if you don't want to see this message"
898 STDERR.puts "[1] commits (#{commits.length}):"
900 PP.pp c.to_a, $stderr
903 STDERR.puts "#{commits.length} coalesced commits"
906 commits.each { |c| c.export(parse_options) }