5 * Refactor commit coalescing
6 * Add --strict-symbol-check to only coalesce commits if their symbol lists are equal
7 * Add support for commitid for coalescing commits
8 * Further coalescing options? (e.g. small logfile differences)
9 * Proper branching support in multi-file export
10 * Optimize memory usage by discarding unneeded text
# Raised when a revision that lives on an RCS branch is turned into a
# multi-file commit, because branched history is not supported there.
# The multi-file export rescues this error so that such revisions can be
# skipped when --skip-branches is given.
16 class NoBranchSupport < NotImplementedError ; end
18 # Integer#odd? was introduced in Ruby 1.8.7, backport it to
20 unless 2.respond_to? :odd?
30 #{$0} [options] file [file ...]
32 Fast-export the RCS history of one or more files. If a directory is specified,
33 all RCS-tracked files in the directory and its descendants are exported.
35 When importing single files, their pathname is discarded during import. When
36 importing directories, only the specified directory component is discarded.
38 When importing a single file, RCS commits are converted one by one. Otherwise,
39 some heuristics are used to determine how to coalesce commits touching different
42 Currently, commits are coalesced if they share the exact same log and if their
43 date differs by no more than the user-specified fuzziness. Additionally, the
44 symbols in one of the commits must be a subset of the symbols in the other
45 commit, unless --no-symbol-check is specified or rcs.symbolCheck is set to
46 false in the git configuration.
49 git init && rcs-fast-export.rb . | git fast-import && git reset
52 --help, -h, -? display this help text
53 --authors-file, -A specify a file containing username = Full Name <email> mappings
54 --rcs-commit-fuzz fuzziness in RCS commits to be considered a single one when
55 importing multiple files
56 (in seconds, defaults to 300, i.e. 5 minutes)
57 --[no-]symbol-check [do not] check symbols when coalescing commits
58 --[no-]tag-each-rev [do not] create a lightweight tag for each RCS revision when
59 importing a single file
60 --[no-]log-filename [do not] prepend the filename to the commit log when importing
62 --skip-branches when exporting multiple files with a branched history, export
63 the main branch only instead of aborting due to the lack of
64 support for branched multi-file history export
69 rcs.authorsFile for --authors-file
70 rcs.tagEachRev for --tag-each-rev
71 rcs.logFilename for --log-filename
72 rcs.commitFuzz for --rcs-commit-fuzz
73 rcs.symbolCheck for --symbol-check
74 rcs.tagFuzz for --rcs-tag-fuzz
80 STDERR.puts "Could not find #{arg}"
83 # returns a hash that maps usernames to author names & emails
84 def load_authors_file(fn)
87 File.open(File.expand_path(fn)) do |io|
88 io.each_line do |line|
89 uname, author = line.split('=', 2)
92 STDERR.puts "Username #{uname} redefined to #{author}" if hash.has_key? uname
104 fields = string.split('.')
105 raise ArgumentError, "wrong number of fields for RCS date #{string}" unless fields.length == 6
111 # strip an optional final ;
116 # strip the first and last @, and de-double @@s
117 def RCS.sanitize(arg)
121 raise 'malformed first line' unless ret.first[0,1] == '@'
122 raise 'malformed last line' unless ret.last[-1,1] == '@'
123 ret.first.sub!(/^@/,'')
124 ret.last.sub!(/@$/,'')
125 ret.map { |l| l.gsub('@@','@') }
127 arg.chomp('@').sub(/^@/,'').gsub('@@','@')
134 def RCS.at_clean(arg)
135 RCS.sanitize RCS.clean(arg)
143 @@marks[key] = @@marks.length + 1
147 def RCS.blob(file, rev)
148 RCS.mark([file, rev])
151 def RCS.commit(commit)
156 attr_accessor :head, :comment, :desc, :revision, :fname, :mode
157 def initialize(fname, executable)
162 @revision = Hash.new { |h, r| h[r] = Revision.new(self, r) }
163 @mode = executable ? '755' : '644'
166 def has_revision?(rev)
167 @revision.has_key?(rev) and not @revision[rev].author.nil?
170 def export_commits(opts={})
173 until @revision.empty?
176 # a string sort is a very good candidate for
177 # export order, getting a miss only for
178 # multi-digit revision components
179 keys = @revision.keys.sort
181 STDERR.puts "commit export loop ##{counter}"
182 STDERR.puts "\t#{exported.length} commits exported so far: #{exported.join(', ')}" unless exported.empty?
183 STDERR.puts "\t#{keys.size} to export: #{keys.join(', ')}"
187 # the parent commit is rev.next if we're on the
188 # master branch (rev.branch is nil) or
189 # rev.diff_base otherwise
190 from = rev.branch.nil? ? rev.next : rev.diff_base
191 # A commit can only be exported if it has no
192 # parent, or if the parent has been exported
193 # already. Skip this commit otherwise
194 if from and not exported.include? from
198 branch = rev.branch || 'master'
199 author = opts[:authors][rev.author] || "#{rev.author} <empty>"
200 date = "#{rev.date.tv_sec} +0000"
202 if opts[:log_filename]
203 log << @fname << ": "
207 puts "commit refs/heads/#{branch}"
208 puts "mark :#{RCS.commit key}"
209 puts "committer #{author} #{date}"
210 puts "data #{log.length}"
211 puts log unless log.empty?
212 puts "from :#{RCS.commit from}" if rev.branch_point
213 puts "M #{@mode} :#{RCS.blob @fname, key} #{@fname}"
215 # TODO FIXME this *should* be safe, in
216 # that it should not unduly move
217 # branches back in time, but I'm not
219 rev.branches.each do |sym|
220 puts "reset refs/heads/#{sym}"
221 puts "from :#{RCS.commit key}"
223 rev.symbols.each do |sym|
224 puts "reset refs/tags/#{sym}"
225 puts "from :#{RCS.commit key}"
227 if opts[:tag_each_rev]
228 puts "reset refs/tags/#{key}"
229 puts "from :#{RCS.commit key}"
234 exported.each { |k| @revision.delete(k) }
240 attr_accessor :rev, :author, :state, :next
241 attr_accessor :branches, :log, :text, :symbols
242 attr_accessor :branch, :diff_base, :branch_point
244 def initialize(file, rev)
261 @date = Time.rcs(str)
266 ret = "blob\nmark :#{RCS.blob @file.fname, @rev}\ndata #{str.length}\n#{str}\n"
271 def RCS.parse(fname, rcsfile)
272 rcs = RCS::File.new(fname, ::File.executable?(rcsfile))
274 ::File.open(rcsfile, 'r:ASCII-8BIT') do |file|
279 file.each_line do |line|
282 command, args = line.split($;,2)
283 next if command.empty?
285 if command.chomp!(';')
286 STDERR.puts "Skipping empty command #{command.inspect}" if $DEBUG
292 rcs.head = RCS.clean(args.chomp)
298 rcs.comment = RCS.at_clean(args.chomp)
301 if rcs.has_revision?(rev)
302 status.push :revision_data
304 status.push :new_revision
309 status.push :read_lines
310 when 'branch', 'access', 'locks', 'expand'
311 STDERR.puts "Skipping unhandled command #{command.inspect}" if $DEBUG
312 status.push :skipping_lines
316 raise "Unknown command #{command.inspect}"
319 status.pop if line.strip.chomp!(';')
321 # we can have multiple symbols per line
322 pairs = line.strip.split($;)
324 sym, rev = pair.strip.split(':',2);
326 status.pop if rev.chomp!(';')
327 rcs.revision[rev].symbols << sym
333 rcs.desc.replace lines.dup
336 # we sanitize lines as we read them
338 actual_line = line.dup
340 # the first line must begin with a @, which we strip
342 ats = line.match(/^@+/)
343 raise 'malformed line' unless ats
344 actual_line.replace line.sub(/^@/,'')
347 # if the line ends with an ODD number of @, it's the
348 # last line -- we work on actual_line so that content
349 # such as @\n or @ work correctly (they would be
350 # encoded respectively as ['@@@\n','@\n'] and
352 ats = actual_line.chomp.match(/@+$/)
353 if nomore = (ats && Regexp.last_match(0).length.odd?)
354 actual_line.replace actual_line.chomp.sub(/@$/,'')
356 lines << actual_line.gsub('@@','@')
363 when /^date\s+(\S+);\s+author\s+(\S+);\s+state\s+(\S+);$/
364 rcs.revision[rev].date = $1
365 rcs.revision[rev].author = $2
366 rcs.revision[rev].state = $3
368 status.push :branches
371 when /^next\s+(\S+)?;$/
372 nxt = rcs.revision[rev].next = $1
374 raise "multiple diff_bases for #{nxt}" unless rcs.revision[nxt].diff_base.nil?
375 rcs.revision[nxt].diff_base = rev
376 rcs.revision[nxt].branch = rcs.revision[rev].branch
381 candidate = line.split(';',2)
382 branch = candidate.first.strip
383 rcs.revision[rev].branches << branch
384 raise "multiple diff_bases for #{branch}" unless rcs.revision[branch].diff_base.nil?
385 rcs.revision[branch].diff_base = rev
386 # we drop the last number from the branch name
387 rcs.revision[branch].branch = branch.sub(/\.\d+$/,'.x')
388 rcs.revision[branch].branch_point = rev
389 status.pop if candidate.length > 1
395 status.push :read_lines
403 status.push :read_lines
408 rcs.revision[rev].log.replace lines.dup
411 rcs.revision[rev].text.replace lines.dup
412 puts rcs.revision[rev].blob
415 difflines.replace lines.dup
416 difflines.pop if difflines.last.empty?
417 base = rcs.revision[rev].diff_base
418 unless rcs.revision[base].text
421 raise 'no diff base!'
425 rcs.revision[base].text.each { |l| buffer << [l.dup] }
431 while l = difflines.shift
433 raise 'negative index during insertion' if index < 0
434 raise 'negative count during insertion' if count < 0
437 # collected all the lines, put them before
442 buffer[index].unshift(*adding)
449 raise 'malformed diff' unless l =~ /^([ad])(\d+) (\d+)$/
455 # for deletion, index 1 is the first index, so the Ruby
456 # index is one less than the diff one
458 # we replace them with empty string so that 'a' commands
459 # referring to the same line work properly
466 # addition will prepend the appropriate lines
467 # to the given index, and in this case Ruby
468 # and diff indices are the same
473 # turn the buffer into an array of lines, deleting the empty ones
474 buffer.delete_if { |l| l.empty? }
477 rcs.revision[rev].text = buffer
478 puts rcs.revision[rev].blob
481 raise "Unknown status #{status.last}"
486 # clean up the symbols/branches: look for revisions that have
487 # one or more symbols but no dates, and make them into
488 # branches, pointing to the highest commit with that key
490 keys = rcs.revision.keys
491 rcs.revision.each do |key, rev|
492 if rev.date.nil? and not rev.symbols.empty?
493 top = keys.select { |k| k.match(/^#{key}\./) }.sort.last
494 tr = rcs.revision[top]
495 raise "unhandled complex branch structure met: #{rev.inspect} refers #{tr.inspect}" if tr.date.nil?
496 tr.branches |= rev.symbols
500 branches.each { |k| rcs.revision.delete k }
506 def initialize(commit)
512 testfiles = @files.dup
513 tree.each { |rcs, rev| self.add(rcs, rev, testfiles) }
514 # the next line is only reached if all the adds were
515 # successful, so the merge is atomic
516 @files.replace testfiles
519 def add(rcs, rev, file_list=@files)
520 if file_list.key? rcs
521 prev = file_list[rcs]
522 if prev.log == rev.log
523 str = "re-adding existing file #{rcs.fname} (old: #{prev.rev}, new: #{rev.rev})"
525 str = "re-adding existing file #{rcs.fname} (old: #{[prev.rev, prev.log.to_s].inspect}, new: #{[rev.rev, rev.log.to_s].inspect})"
527 if prev.text != rev.text
530 @commit.warn_about str
542 @files.map do |rcs, rev|
543 if rev.state.downcase == "dead"
544 files << "D #{rcs.fname}"
546 files << "M #{rcs.mode} :#{RCS.blob rcs.fname, rev.rev} #{rcs.fname}"
553 @files.map { |rcs, rev| rcs.fname }
562 attr_accessor :date, :log, :symbols, :author, :branch
564 attr_accessor :min_date, :max_date
565 def initialize(rcs, rev)
566 raise NoBranchSupport if rev.branch
567 self.date = rev.date.dup
568 self.min_date = self.max_date = self.date
569 self.log = rev.log.dup
570 self.symbols = rev.symbols.dup
571 self.author = rev.author
572 self.branch = rev.branch
574 self.tree = Tree.new self
575 self.tree.add rcs, rev
579 [self.min_date, self.date, self.max_date, self.branch, self.symbols, self.author, self.log, self.tree.to_a]
583 warn str + " for commit on #{self.date}"
586 # Sort by date and then by number of symbols
588 ds = self.date <=> other.date
592 return self.symbols.length <=> other.symbols.length
597 self.tree.merge! commit.tree
598 if commit.max_date > self.max_date
599 self.max_date = commit.max_date
601 if commit.min_date < self.min_date
602 self.min_date = commit.min_date
604 self.symbols.merge commit.symbols
608 xbranch = self.branch || 'master'
609 xauthor = opts[:authors][self.author] || "#{self.author} <empty>"
611 numdate = self.date.tv_sec
612 xdate = "#{numdate} +0000"
615 puts "commit refs/heads/#{xbranch}"
616 puts "mark :#{RCS.commit key}"
617 puts "committer #{xauthor} #{xdate}"
618 puts "data #{xlog.length}"
619 puts xlog unless xlog.empty?
620 # TODO branching support for multi-file export
621 # puts "from :#{RCS.commit from}" if self.branch_point
624 # TODO branching support for multi-file export
625 # rev.branches.each do |sym|
626 # puts "reset refs/heads/#{sym}"
627 # puts "from :#{RCS.commit key}"
630 self.symbols.each do |sym|
631 puts "reset refs/tags/#{sym}"
632 puts "from :#{RCS.commit key}"
641 opts = GetoptLong.new(
642 # Authors file, like git-svn and git-cvsimport, more than one can be
644 ['--authors-file', '-A', GetoptLong::REQUIRED_ARGUMENT],
645 # RCS file suffix, like RCS
646 ['--rcs-suffixes', '-x', GetoptLong::REQUIRED_ARGUMENT],
647 # Date fuzziness for commits to be considered the same (in seconds)
648 ['--rcs-commit-fuzz', GetoptLong::REQUIRED_ARGUMENT],
649 # check symbols when coalescing?
650 ['--symbol-check', GetoptLong::NO_ARGUMENT],
651 ['--no-symbol-check', GetoptLong::NO_ARGUMENT],
653 ['--tag-each-rev', GetoptLong::NO_ARGUMENT],
654 ['--no-tag-each-rev', GetoptLong::NO_ARGUMENT],
655 # prepend filenames to commit logs?
656 ['--log-filename', GetoptLong::NO_ARGUMENT],
657 ['--no-log-filename', GetoptLong::NO_ARGUMENT],
658 # skip branches when exporting a whole tree?
659 ['--skip-branches', GetoptLong::NO_ARGUMENT],
660 ['--help', '-h', '-?', GetoptLong::NO_ARGUMENT]
663 # We read options in order, but they apply to all passed parameters.
664 # TODO maybe they should only apply to the following, unless there's only one
666 opts.ordering = GetoptLong::RETURN_IN_ORDER
670 :authors => Hash.new,
675 # Read config options
676 `git config --get-all rcs.authorsfile`.each_line do |fn|
677 parse_options[:authors].merge! load_authors_file(fn.chomp)
680 parse_options[:tag_each_rev] = (
681 `git config --bool rcs.tageachrev`.chomp == 'true'
684 parse_options[:log_filename] = (
685 `git config --bool rcs.logfilename`.chomp == 'true'
688 fuzz = `git config --int rcs.commitFuzz`.chomp
689 parse_options[:commit_fuzz] = fuzz.to_i unless fuzz.empty?
691 fuzz = `git config --int rcs.tagFuzz`.chomp
692 parse_options[:tag_fuzz] = fuzz.to_i unless fuzz.empty?
694 parse_options[:symbol_check] = (
695 `git config --bool rcs.symbolcheck`.chomp == 'false'
698 opts.each do |opt, arg|
700 when '--authors-file'
701 authors = load_authors_file(arg)
702 redef = parse_options[:authors].keys & authors.keys
703 STDERR.puts "Authors file #{arg} redefines #{redef.join(', ')}" unless redef.empty?
704 parse_options[:authors].merge!(authors)
705 when '--rcs-suffixes'
707 when '--rcs-commit-fuzz'
708 parse_options[:commit_fuzz] = arg.to_i
709 when '--rcs-tag-fuzz'
710 parse_options[:tag_fuzz] = arg.to_i
711 when '--symbol-check'
712 parse_options[:symbol_check] = true
713 when '--no-symbol-check'
714 parse_options[:symbol_check] = false
715 when '--tag-each-rev'
716 parse_options[:tag_each_rev] = true
717 when '--no-tag-each-rev'
718 # this is the default, which is fine since the missing key
719 # (default) returns nil which is false in Ruby
720 parse_options[:tag_each_rev] = false
721 when '--log-filename'
722 parse_options[:log_filename] = true
723 when '--no-log-filename'
724 # this is the default, which is fine since the missing key
725 # (default) returns nil which is false in Ruby
726 parse_options[:log_filename] = false
727 when '--skip-branches'
728 parse_options[:skip_branches] = true
737 if parse_options[:tag_fuzz] < parse_options[:commit_fuzz]
738 parse_options[:tag_fuzz] = parse_options[:commit_fuzz]
743 user = Etc.getlogin || ENV['USER']
745 # steal username/email data from other init files that may contain the
749 # the user's .hgrc file for a username field
750 ['~/.hgrc', /^\s*username\s*=\s*(["'])?(.*)\1$/, 2],
751 # the user's .(g)vimrc for a changelog_username setting
752 ['~/.vimrc', /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
753 ['~/.gvimrc', /changelog_username\s*=\s*(["'])?(.*)\1$/, 2],
755 ].each do |fn, rx, idx|
756 file = File.expand_path fn
757 if File.readable?(file) and File.read(file) =~ rx
758 parse_options[:authors][user] = Regexp.last_match(idx).strip
764 if user and not user.empty? and not parse_options[:authors].has_key?(user)
765 name = ENV['GIT_AUTHOR_NAME'] || ''
766 name.replace(`git config user.name`.chomp) if name.empty?
767 name.replace(Etc.getpwnam(user).gecos) if name.empty?
770 # couldn't find a name, try to steal data from other sources
773 # if we found a name, try to find an email too
774 email = ENV['GIT_AUTHOR_EMAIL'] || ''
775 email.replace(`git config user.email`.chomp) if email.empty?
778 # couldn't find an email, try to steal data too
781 # we got both a name and email, fill the info
782 parse_options[:authors][user] = "#{name} <#{email}>"
797 file_list.each do |arg|
798 case ftype = File.ftype(arg)
804 not_found "RCS file #{arg}"
807 filename = File.basename(arg, SFX)
809 filename = File.basename(arg)
810 path = File.dirname(arg)
811 rcsfile = File.join(path, 'RCS', filename) + SFX
812 unless File.exists? rcsfile
813 rcsfile.replace File.join(path, filename) + SFX
814 unless File.exists? rcsfile
815 not_found "RCS file for #{filename} in #{path}"
819 rcs << RCS.parse(filename, rcsfile)
821 pattern = File.join(arg, '**', '*' + SFX)
822 Dir.glob(pattern).each do |rcsfile|
823 filename = File.basename(rcsfile, SFX)
824 path = File.dirname(rcsfile)
825 path.sub!(/\/?RCS$/, '') # strip final /RCS if present
826 path.sub!(/^#{Regexp.escape arg}\/?/, '') # strip initial dirname
827 filename = File.join(path, filename) unless path.empty?
829 rcs << RCS.parse(filename, rcsfile)
830 rescue Exception => e
831 STDERR.puts "Failed to parse #{filename} @ #{rcsfile}:#{$.}"
836 STDERR.puts "Cannot handle #{arg} of #{ftype} type"
842 rcs.first.export_commits(parse_options)
844 STDERR.puts "Preparing commits"
849 r.revision.each do |k, rev|
851 commits << RCS::Commit.new(r, rev)
852 rescue NoBranchSupport
853 if parse_options[:skip_branches]
854 STDERR.puts "Skipping revision #{rev.rev} for #{r.fname} (branch)"
861 STDERR.puts "Sorting by date"
866 STDERR.puts "RAW commits (#{commits.length}):"
868 PP.pp c.to_a, $stderr
871 STDERR.puts "#{commits.length} single-file commits"
874 STDERR.puts "Coalescing [1] by date with fuzz #{parse_options[:commit_fuzz]}"
876 thisindex = commits.size
877 commits.reverse_each do |c|
878 nextindex = thisindex
881 cfiles = Set.new c.tree.filenames
886 # test for mergeable commits by looking at following commits
887 while nextindex < commits.size
888 k = commits[nextindex]
891 # commits are date-sorted, so we know we can quit early if we are too far
892 # for coalescing to work
893 break if k.min_date > c.max_date + parse_options[:commit_fuzz]
897 kfiles = Set.new k.tree.filenames
899 if c.log != k.log or c.author != k.author or c.branch != k.branch
903 unless c.symbols.subset?(k.symbols) or k.symbols.subset?(c.symbols)
904 cflist = cfiles.to_a.join(', ')
905 kflist = kfiles.to_a.join(', ')
906 if parse_options[:symbol_check]
907 STDERR.puts "Not coalescing #{c.log.inspect}\n\tfor (#{cflist})\n\tand (#{kflist})"
908 STDERR.puts "\tbecause their symbols disagree:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
909 STDERR.puts "\tretry with the --no-symbol-check option if you want to merge these commits anyway"
912 STDERR.puts "Coalescing #{c.log.inspect}\n\tfor (#{cflist})\n\tand (#{kflist})"
913 STDERR.puts "\twith disagreeing symbols:\n\t#{c.symbols.to_a.inspect} and #{k.symbols.to_a.inspect} disagree on #{(c.symbols ^ k.symbols).to_a.inspect}"
917 # keep track of filenames touched by commits we are not merging with,
918 # since we don't want to merge with commits that touch them, to preserve
919 # the monotonicity of history for each file
920 # TODO we could forward-merge with them, unless some of our files were
923 # if the candidate touches any file already in the commit,
924 # we can stop looking forward
925 break unless cfiles.intersection(kfiles).empty?
930 # the candidate has the same log, author, branch and appropriate symbols
931 # does it touch anything in ofiles?
932 unless ofiles.intersection(kfiles).empty?
934 cflist = cfiles.to_a.join(', ')
935 kflist = kfiles.to_a.join(', ')
936 oflist = ofiles.to_a.join(', ')
937 STDERR.puts "Not coalescing #{c.log.inspect}\n\tfor (#{cflist})\n\tand (#{kflist})"
938 STDERR.puts "\tbecause the latter intersects #{oflist} in #{(ofiles & kfiles).to_a.inspect}"
946 mergeable.each do |k|
949 rescue RuntimeError => err
950 fuzz = c.date - k.date
951 STDERR.puts "Fuzzy commit coalescing failed: #{err}"
952 STDERR.puts "\tretry with commit fuzz < #{fuzz} if you don't want to see this message"
960 STDERR.puts "[1] commits (#{commits.length}):"
962 PP.pp c.to_a, $stderr
965 STDERR.puts "#{commits.length} coalesced commits"
968 commits.each { |c| c.export(parse_options) }