1 # The Detector determines which Monoglot or Polyglot should be
2 # used to parse a source file.
4 # The Detector primarily uses filename extensions to identify languages.
6 # The hash EXTENSION_MAP maps a filename extension to the name of a parser.
8 # If a filename extension is not enough to determine the correct parser (for
9 # instance, the *.m extension can indicate either a Matlab or Objective-C file),
10 # then the EXTENSION_MAP hash will contain a symbol identifying a Ruby method
11 # which will be invoked. This Ruby method can examine the file
12 # contents and return the name of the correct parser.
14 # Many source files do not have an extension. The method +disambiguate_nil+
15 # is called in these cases. The +file+ command line tool is used to determine
16 # the type of file and select a parser.
18 # The Detector is covered by DetectorTest.
20 class Ohcount::Detector
23 # A performance hack -- once we've checked for the presence of *.m files, the result
24 # is stored here to avoid checking twice.
25 attr_accessor :contains_m
26 # A performance hack -- once we've checked for the presence of *.pike and *.pmod files, the result
27 # is stored here to avoid checking twice.
28 attr_accessor :contains_pike_or_pmod
31 # The primary entry point for the detector.
32 # Given a file context containing the file name, content, and an array of
33 # other filenames in the source tree, attempts to detect which
34 # language family (Monoglot or Polyglot) is in use for this file.
36 # Returns nil if the language is not recognized or if the file does not
41 # # List all C files in the 'src' directory
42 # Dir.entries("src").each do |file|
43 # context = Ohcount::SimpleFileContext.new(file)
44 # polyglot = Ohcount::Detector.detect(context)
45 # puts "#{file}" if polyglot == 'c'
48 def self.detect(file_context)
49 # start with extension
# Look the extension up exactly as written first: EXTENSION_MAP has
# case-sensitive keys (for example both '.f' and '.F').
50 polyglot = EXTENSION_MAP[File.extname(file_context.filename)]
# Only when that lookup finds nothing, retry with the lower-cased extension.
51 polyglot = EXTENSION_MAP[File.extname(file_context.filename).downcase] unless polyglot
# A String entry is the parser name itself and is returned unchanged.
55 return polyglot if polyglot.is_a?(String)
57 # extension is ambiguous - requires custom disambiguation
# The map entry names a class method of the Detector; call it with the
# file context so that it can pick the parser.
58 self.send(polyglot, file_context)
# Hand the file to disambiguate_nil, which asks the +file+ command line tool.
60 return disambiguate_nil(file_context)
# Presumably reached only for a map value of an unexpected kind -- confirm.
62 raise RuntimeError.new("Unknown file detection type")
66 # Based solely on the filename, makes a judgment whether a file is a binary format.
67 def self.binary_filename?(filename)
# Only the extension of +filename+ is tested, exactly as written (no case
# folding), for membership in +ignore+.
95 ignore.include?(File.extname(filename))
98 # If an extension maps to a string, that string must be the name of a glot.
99 # If an extension maps to a Ruby symbol, that symbol must be the name of a
100 # Ruby method which will return the name of a glot.
105 '.asm' => "assembler",
107 '.bas' => "visualbasic",
118 '.el' => "emacslisp",
121 '.cs' => :disambiguate_cs,
124 '.f' => :disambiguate_fortran,
125 '.ftn' => :disambiguate_fortran,
126 '.f77' => :disambiguate_fortran,
127 '.f90' => :disambiguate_fortran,
128 '.f95' => :disambiguate_fortran,
129 '.f03' => :disambiguate_fortran,
130 '.F' => :disambiguate_fortran,
131 '.F77' => :disambiguate_fortran,
132 '.F90' => :disambiguate_fortran,
133 '.F95' => :disambiguate_fortran,
134 '.F03' => :disambiguate_fortran,
135 '.frx' => "visualbasic",
136 '.groovy'=> "groovy",
137 '.h' => :disambiguate_h_header,
147 '.in' => :disambiguate_in,
148 '.inc' => :disambiguate_inc,
150 '.js' => "javascript",
152 '.lhs' => "lit_haskell",
156 '.m' => :matlab_or_objective_c,
158 '.mm' => "objective_c",
159 '.mp' => 'metapost_with_tex',
185 '.st' => "smalltalk",
189 '.vb' => "visualbasic",
190 '.vba' => "visualbasic",
191 '.vbs' => "visualbasic",
193 '.web' => "knuth_web",
195 '.xsd' => "xmlschema",
206 # Returns a count of lines in the buffer matching the given regular expression.
#
# +buffer+ is either a String, which is scanned line by line, or any
# enumerable that yields lines (an Array of Strings, for instance).
# +re+ is the Regexp tested against each line.
def self.lines_matching(buffer, re)
  # String is not Enumerable from Ruby 1.9 on, so walk its lines explicitly;
  # other collections are counted directly.
  lines = buffer.respond_to?(:each_line) ? buffer.each_line : buffer
  lines.count { |line| line =~ re }
end
211 # For *.m files, differentiates Matlab from Objective-C.
213 # This is done with a weighted heuristic that
214 # scans the *.m file contents for keywords,
215 # and also checks for the presence of matching *.h files.
#
# +file_context+ must respond to +contents+ (the text of the *.m file) and
# +filenames+ (the other filenames in the source tree).
# Returns 'matlab' when the combined score is positive, 'objective_c' otherwise.
def self.matlab_or_objective_c(file_context)
  buffer = file_context.contents

  # if there are .h files in same directory, this probably isn't matlab
  # The score starts at 0.0 so that the sum below is defined when there is
  # no header file at all.
  h_headers = file_context.filenames.any? { |name| name =~ /\.h$/ } ? -0.5 : 0.0

  # if the contents contain 'function (' on a single line - very likely to be matlab
  # if the contents contain lines starting with '%', it's probably matlab comments
  matlab_signatures = /(^\s*function\s*)|(^\s*%)/
  matlab_sig_score = 0.1 * lines_matching(buffer, matlab_signatures)

  # if the contents contains '//' or '/*', likely objective_c
  # (lines starting with '+' or '-' count against matlab as well)
  objective_c_signatures = /(^\s*\/\/\s*)|(^\s*\/\*)|(^[+-])/
  obj_c_sig_score = -0.1 * lines_matching(buffer, objective_c_signatures)

  matlab = h_headers + matlab_sig_score + obj_c_sig_score

  matlab > 0 ? 'matlab' : 'objective_c'
end
237 # For *.h files, differentiates C, C++ and Objective-C.
239 # This is done with a weighted heuristic that
240 # scans the *.h file contents for Objective-C keywords,
241 # C++ keywords and C++ headers, and also checks for the
242 # presence of matching *.m files.
#
# +file_context+ must respond to +filename+, +filenames+ and +contents+.
# Returns 'objective_c' or 'pike' when their signatures are found, otherwise
# whatever disambiguate_c_cpp decides for the contents.
def self.disambiguate_h_header(file_context)
  buffer = file_context.contents
  filenames = file_context.filenames

  # Speed hack - the same filenames list tends to be passed over and over, so
  # the tree-wide findings are cached on the list object itself.
  unless filenames.respond_to?(:contains_m)
    filenames.extend(ContainsM)
    filenames.contains_m = filenames.any? { |name| name =~ /\.m$/ }
    filenames.contains_pike_or_pmod = filenames.any? { |name| name =~ /\.p(ike|mod)$/ }
  end

  # could it realistically be objective_c? only if there are any .m files at all
  if filenames.contains_m
    # if the dir contains a matching *.m file, likely objective_c
    if file_context.filename =~ /\.h$/
      m_counterpart = file_context.filename.gsub(/\.h$/, ".m")
      return 'objective_c' if filenames.include?(m_counterpart)
    end

    # ok - it just might be objective_c, let's check contents for objective_c signatures
    # (more than one matching line is required)
    objective_c_signatures = /(^\s*@interface)|(^\s*@end)/
    return 'objective_c' if lines_matching(buffer, objective_c_signatures) > 1
  end

  if filenames.contains_pike_or_pmod
    # The string "pike" and a selection of common Pike keywords.
    pike_signatures = /([Pp][Ii][Kk][Ee])|(string )|(mapping)|(multiset)|(import )|(inherit )|(predef)/
    return 'pike' if lines_matching(buffer, pike_signatures) > 0
  end

  disambiguate_c_cpp(buffer)
end
277 # A map of headers that indicate C++, but that do not have C++-specific file
278 # extensions. This list is made from the Standard, plus Technical Report 1.
# The word list is folded into a Hash of name => true, so that testing a
# header name for membership is a hash lookup.
279 CPP_HEADERS_MAP = %w[
369 ].inject({}) { | h, k | h[k] = true ; h }
371 # A map of keywords that indicate C++.
# The word list is folded into a Hash of keyword => true, so that testing a
# word for membership is a hash lookup.
372 CPP_KEYWORDS_MAP = %w[
377 ].inject({}) { | h, k | h[k] = true ; h }
379 # For *.h files that we know aren't Objective-C, differentiates C and C++.
381 # This is done with a weighted heuristic that
382 # scans the *.h file contents for C++ keywords and C++ headers.
#
# +buffer+ is the header text, either a String or an enumerable of lines.
# Returns 'cpp' as soon as one C++ indicator is found, 'c' otherwise.
def self.disambiguate_c_cpp(buffer)
  # Look for C++ headers: either the included file has an extension mapped
  # to 'cpp', or it is one of the extension-less C++ headers.
  includes_cpp_header = extract_c_cpp_headers(buffer).any? do |header|
    EXTENSION_MAP[File.extname(header)] == 'cpp' || CPP_HEADERS_MAP.include?(header)
  end
  return 'cpp' if includes_cpp_header

  # Look for C++ keywords. This could check for comments, but doesn't.
  # String is not Enumerable from Ruby 1.9 on, so walk its lines explicitly.
  lines = buffer.respond_to?(:each_line) ? buffer.each_line : buffer
  uses_cpp_keyword = lines.any? do |line|
    line.split(/\W/).any? { |word| CPP_KEYWORDS_MAP.include?(word) }
  end
  return 'cpp' if uses_cpp_keyword

  # Nothing to suggest C++
  'c'
end
400 # Return a list of files included in a C or C++ source file.
#
# +buffer+ is the source text, either a String or an enumerable of lines.
# Returns an Array holding, in order of appearance, the name found between
# the delimiters of every include directive; lines without one are skipped.
def self.extract_c_cpp_headers(buffer)
  # String is not Enumerable from Ruby 1.9 on, so walk its lines explicitly.
  lines = buffer.respond_to?(:each_line) ? buffer.each_line : buffer
  # The capture is lazy so that it stops at the first closing delimiter, even
  # when a trailing comment on the same line holds another quote or '>'.
  lines.map { |line| line[/^#\s*include\s+[<"](.*?)[>"]/, 1] }.compact
end
407 # Tests whether the provided buffer contains binary or text content.
408 # This is not fool-proof -- we basically just check for zero values
409 # in the early bytes of the buffer. If we find a zero, we know it
410 # is not (ascii) text.
411 def self.binary_buffer?(buffer)
# NOTE(review): comparing buffer[i] with 0 assumes that indexing yields an
# Integer byte value, as String#[] did in Ruby 1.8. From Ruby 1.9 on String#[]
# returns a one-character String, so this would never be true for a String
# buffer (String#getbyte would be needed) -- confirm the target Ruby version.
413 return true if buffer[i] == 0
418 # True if the provided buffer includes a '?php' directive
419 def self.php_instruction?(buffer)
423 # For *.in files, checks the prior extension.
424 # Typically used for template files (eg Makefile.in, auto.c.in, etc).
#
# Returns the result of detect for the name without its ".in" suffix, or nil
# when the filename carries no further extension in front of ".in".
def self.disambiguate_in(file_context)
  # Without an extension prior to the .in there is nothing to go on.
  return nil unless file_context.filename =~ /\..*\.in$/

  # Drop the trailing ".in" and run the detection again on the remaining
  # name, carrying the other context fields over unchanged.
  inner_name = file_context.filename.gsub(/\.in$/, "")
  inner_context = Ohcount::SimpleFileContext.new(
    inner_name,
    file_context.filenames,
    file_context.contents,
    file_context.file_location
  )
  detect(inner_context)
end
435 # For *.inc files, checks for a PHP class.
#
# Returns 'php' for a text buffer holding a PHP instruction, nil for binary
# content and for anything else.
def self.disambiguate_inc(file_context)
  contents = file_context.contents
  if binary_buffer?(contents)
    # Binary content is never treated as source.
    nil
  elsif php_instruction?(contents)
    'php'
  end
end
443 # For files with extension *.cs, differentiates C# from Clearsilver.
444 def self.disambiguate_cs(file_context)
# NOTE(review): +buffer+ is assigned here, but the check below reads
# file_context.contents directly instead of using it.
445 buffer = file_context.contents
# A single line containing the "<?cs" tag is enough to call the file a
# Clearsilver template.
446 return 'clearsilver_template' if lines_matching(file_context.contents, /\<\?cs/) > 0
# Differentiates fixed-form from free-form Fortran by looking at the first
# columns of the lines in the file.
#
# +file_context+ must respond to +contents+.
# Returns 'fortranfixed' or 'fortranfree'.
def self.disambiguate_fortran(file_context)
  buffer = file_context.contents

  # A line starting with a blank followed by five characters that are neither
  # digits nor blanks.
  # NOTE(review): the pattern's name suggests "not Fortran 77", yet a match
  # yields 'fortranfixed'; kept as is -- confirm the intent.
  definitely_not_f77 = /^ [^0-9 ]{5}/
  return 'fortranfixed' if lines_matching(buffer, definitely_not_f77) > 0

  # An '&' ending one line and an '&' opening the next one.
  free_form_continuation = /&\s*\n\s*&/m
  return 'fortranfree' if buffer.match(free_form_continuation)

  # Lines starting with a blank followed by five digits or blanks are fixed
  # form unless those columns hold exactly one contiguous number.
  possibly_fixed = /^ [0-9 ]{5}/
  contig_number = /^\s*\d+\s*$/
  buffer.scan(possibly_fixed) do |leader|
    return 'fortranfixed' unless leader =~ contig_number
  end

  # Might as well be free-form.
  'fortranfree'
end
467 # Attempts to determine the Polyglot for files that do not have a
468 # filename extension.
470 # Relies on the Unix +file+ command line tool as its primary method.
472 # There must be a file at <tt>file_context.file_location</tt> for +file+
475 def self.disambiguate_nil(file_context)
476 file_location = file_context.file_location
477 output = `file -b #{ file_location }`
479 when /([\w\/]+) script text/, /script text executable for ([\w\/]+)/
481 if script =~ /\/(\w*)$/
484 known_languages = EXTENSION_MAP.values
485 return script.downcase if known_languages.include?(script.downcase)
486 when /([\w\-]*) shell script text/