1 # Determines the language family (Monoglot or Polyglot) used by a source file.
2 class Ohcount::Detector
5 # A performance hack -- once we've checked for the presence of *.m files, the result
6 # is stored here to avoid checking twice.
7 attr_accessor :contains_m
10 # The primary entry point for the detector.
11 # Given a file context containing the file name, content, and an array of
12 # other filenames in the source tree, attempts to detect which
13 # language family (Monoglot or Polyglot) is in use for this file.
15 # Returns nil if the language is not recognized or if the file does not
17 def self.detect(file_context)
18 # start with extension
19 polyglot = EXTENSION_MAP[File.extname(file_context.filename).downcase]
23 return polyglot if polyglot.is_a?(String)
25 # extension is ambiguous - requires custom disambiguation
26 self.send(polyglot, file_context)
28 return disambiguate_nil(file_context)
30 raise RuntimeError.new("Unknown file detection type")
34 # Based solely on the filename, makes a judgment whether a file is a binary format.
35 def self.binary_filename?(filename)
63 ignore.include?(File.extname(filename))
69 '.asm' => "assembler",
71 '.bas' => "visualbasic",
83 '.cs' => :disambiguate_cs,
86 '.frx' => "visualbasic",
88 '.h' => :disambiguate_h_header,
94 '.inc' => :disambiguate_inc,
96 '.js' => "javascript",
101 '.m' => :matlab_or_objective_c,
102 '.mm' => "objective_c",
126 '.vb' => "visualbasic",
127 '.vba' => "visualbasic",
128 '.vbs' => "visualbasic",
136 # Returns a count of lines in the buffer matching the given regular expression.
137 def self.lines_matching(buffer, re)
138 buffer.inject(0) { |total, line| line =~ re ? total+1 : total }
141 # For *.m files, differentiates Matlab from Objective-C.
143 # This is done with a weighted heuristic that
144 # scans the *.m file contents for keywords,
145 # and also checks for the presence of matching *.h files.
146 def self.matlab_or_objective_c(file_context)
147 buffer = file_context.contents
149 # if there are .h files in same directory, this probably isn't matlab
151 h_headers = -0.5 if file_context.filenames.select { |a| a =~ /\.h$/ }.any?
153 # if the contents contain 'function (' on a single line - very likely to be matlab
154 # if the contents contain lines starting with '%', it's probably matlab comments
155 matlab_signatures = /(^\s*function\s*)|(^\s*%)/
156 matlab_sig_score = 0.1 * lines_matching(buffer, matlab_signatures)
158 # if the contents contain '//' or '/*', likely objective_c
159 objective_c_signatures = /(^\s*\/\/\s*)|(^\s*\/\*)|(^[+-])/
160 obj_c_sig_score = -0.1 * lines_matching(buffer, objective_c_signatures)
162 matlab = h_headers + matlab_sig_score + obj_c_sig_score
164 matlab > 0 ? 'matlab' : 'objective_c'
167 # For *.h files, differentiates C/C++ from Objective-C.
169 # This is done with a weighted heuristic that
170 # scans the *.h file contents for Objective-C keywords,
171 # and also checks for the presence of matching *.m files.
172 def self.disambiguate_h_header(file_context)
173 buffer = file_context.contents
177 # could it realistically be objective_c? are there any .m files at all?
178 # Speed hack - remember our findings in case we get the same filenames over and over
179 unless defined?(file_context.filenames.contains_m)
180 file_context.filenames.extend(ContainsM)
181 file_context.filenames.contains_m = file_context.filenames.select { |a| a =~ /\.m$/ }.any?
183 return 'cncpp' unless file_context.filenames.contains_m
185 # if the dir contains a matching *.m file, likely objective_c
186 if file_context.filename =~ /\.h$/
187 m_counterpart = file_context.filename.gsub(/\.h$/, ".m")
188 return 'objective_c' if file_context.filenames.include?(m_counterpart)
191 # ok - it just might be objective_c, let's check contents for objective_c signatures
192 objective_c_signatures = /(^\s*@interface)|(^\s*@end)/
193 objective_c += lines_matching(buffer, objective_c_signatures)
195 return objective_c > 1 ? 'objective_c' : 'cncpp'
198 # Tests whether the provided buffer contains binary or text content.
199 # This is not fool-proof -- we basically just check for zero values
200 # in the early bytes of the buffer. If we find a zero, we know it
201 # is not (ascii) text.
202 def self.binary_buffer?(buffer)
204 return true if buffer[i] == 0
209 # True if the provided buffer includes a '?php' directive
210 def self.php_instruction?(buffer)
214 # For *.inc files, checks for a PHP class.
215 def self.disambiguate_inc(file_context)
216 buffer = file_context.contents
217 return nil if binary_buffer?(buffer)
218 return 'php' if php_instruction?(buffer)
222 # For files with extension *.cs, differentiates C# from Clearsilver.
223 def self.disambiguate_cs(file_context)
224 buffer = file_context.contents
225 return 'clearsilver_template' if lines_matching(file_context.contents, /\<\?cs/) > 0
229 # Attempts to determine the Polyglot for files that do not have a
230 # filename extension.
232 # Relies on the Unix +file+ command line tool as its primary method.
234 # There must be a file at <tt>file_context.file_location</tt> for +file+
237 def self.disambiguate_nil(file_context)
238 file_location = file_context.file_location
239 output = `file -b #{ file_location }`
241 when /([\w\/]+) script text/, /script text executable for ([\w\/]+)/
243 if script =~ /\/(\w*)$/
246 known_languages = EXTENSION_MAP.values
247 return script.downcase if known_languages.include?(script.downcase)
248 when /([\w\-]*) shell script text/