support Literate Haskell
[ohcount] / lib / ohcount / detector.rb
1 # The Detector determines which Monoglot or Polyglot should be
2 # used to parse a source file.
3 #
4 # The Detector primarily uses filename extensions to identify languages.
5 #
6 # The hash EXTENSION_MAP maps a filename extension to the name of a parser.
7 #
8 # If a filename extension is not enough to determine the correct parser (for
9 # instance, the *.m extension can indicate either a Matlab or Objective-C file),
10 # then the EXTENSION_MAP hash will contain a symbol identifying a Ruby method
11 # which will be invoked. This Ruby method can examine the file
12 # contents and return the name of the correct parser.
13 #
14 # Many source files do not have an extension. The method +disambiguate_nil+
15 # is called in these cases. The +file+ command line tool is used to determine
16 # the type of file and select a parser.
17 #
18 # The Detector is covered by DetectorTest.
19 #
20 class Ohcount::Detector
21
22         module ContainsM
23                 # A performance hack -- once we've checked for the presence of *.m files, the result
24                 # is stored here to avoid checking twice.
25                 attr_accessor :contains_m
26                 # A performance hack -- once we've checked for the presence of *.pike and *.pmod files, the result
27                 # is stored here to avoid checking twice.
28                 attr_accessor :contains_pike_or_pmod
29         end
30
31         # The primary entry point for the detector.
32         # Given a file context containing the file name, content, and an array of
33         # other filenames in the source tree, attempts to detect which
34         # language family (Monoglot or Polyglot) is in use for this file.
35         #
36         # Returns nil if the language is not recognized or if the file does not
37         # contain any code.
38         #
39         # Example:
40         #
41         #   # List all C files in the 'src' directory
42         #   Dir.entries("src").each do |file|
43         #     context = Ohcount::SimpleFileContext.new(file)
44         #     polyglot = Ohcount::Detector.detect(context)
45         #     puts "#{file}" if polyglot == 'c'
46         #   end
47         #
48         def self.detect(file_context)
49                 # start with extension
50                 polyglot = EXTENSION_MAP[File.extname(file_context.filename)]
51                 polyglot = EXTENSION_MAP[File.extname(file_context.filename).downcase] unless polyglot
52     case polyglot
53     when String
54       # simplest case
55                   return polyglot if polyglot.is_a?(String)
56     when Symbol
57                   # extension is ambiguous - requires custom disambiguation
58                         self.send(polyglot, file_context)
59     when NilClass
60       return disambiguate_nil(file_context)
61     else
62       raise RuntimeError.new("Unknown file detection type")
63           end
64   end
65
66         # Based solely on the filename, makes a judgment whether a file is a binary format.
67         def self.binary_filename?(filename)
68                 ignore = [
69                         ".svn",
70                         ".jar",
71                         ".tar",
72                         ".gz",
73                         ".tgz",
74                         ".zip",
75                         ".gif",
76                         ".jpg",
77                         ".jpeg",
78                         ".bmp",
79                         ".png",
80                         ".tif",
81                         ".tiff",
82                         ".ogg",
83                         ".aiff",
84                         ".wav",
85                         ".mp3",
86                         ".au",
87                         ".ra",
88                         ".m4a",
89                         ".pdf",
90                         ".mpg",
91                         ".mov",
92                         ".qt",
93                         ".avi"
94                         ]
95                 ignore.include?(File.extname(filename))
96         end
97
98         # If an extension maps to a string, that string must be the name of a glot.
99         # If an extension maps to a Ruby symbol, that symbol must be the name of a
100         # Ruby method which will return the name of a glot.
101         EXTENSION_MAP = {
102                 '.ada'  => "ada",
103                 '.adb'  => "ada",
104                 '.ads'  => "ada",
105                 '.asm'  => "assembler",
106                 '.awk'  => "awk",
107                 '.bas'  => "visualbasic",
108                 '.bat'  => "bat",
109                 '.boo'  => "boo",
110                 '.c'    => "c",
111                 '.C'    => "cpp",
112                 '.cc'   => "cpp",
113                 '.cpp'  => "cpp",
114                 '.css'  => "css",
115                 '.c++'  => "cpp",
116                 '.cxx'  => "cpp",
117                 '.com'  => "dcl",
118                 '.el'   => "emacslisp",
119                 #               '.cbl'  => "cobol",
120                 #               '.cob'  => "cobol",
121                 '.cs'   => :disambiguate_cs,
122                 '.dylan'=> "dylan",
123                 '.erl'  => "erlang",
124                 '.f'    => :disambiguate_fortran,
125                 '.ftn'  => :disambiguate_fortran,
126                 '.f77'  => :disambiguate_fortran,
127                 '.f90'  => :disambiguate_fortran,
128                 '.f95'  => :disambiguate_fortran,
129                 '.f03'  => :disambiguate_fortran,
130                 '.F'    => :disambiguate_fortran,
131                 '.F77'  => :disambiguate_fortran,
132                 '.F90'  => :disambiguate_fortran,
133                 '.F95'  => :disambiguate_fortran,
134                 '.F03'  => :disambiguate_fortran,
135                 '.frx'  => "visualbasic",
136                 '.groovy'=> "groovy",
137                 '.h'    => :disambiguate_h_header,
138                 '.H'    => "cpp",
139                 '.hpp'  => "cpp",
140                 '.h++'  => "cpp",
141                 '.hs'   => "haskell",
142                 '.hxx'  => "cpp",
143                 '.hh'   => "cpp",
144                 '.hrl'  => "erlang",
145                 '.htm'  => "html",
146                 '.html' => "html",
147                 '.in'   => :disambiguate_in,
148                 '.inc'  => :disambiguate_inc,
149                 '.java' => "java",
150                 '.js'   => "javascript",
151                 '.jsp'  => "jsp",
152                 '.lhs'  => "lit_haskell",
153                 '.lua'  => "lua",
154                 '.lsp'  => "lisp",
155                 '.lisp' => "lisp",
156                 '.m'    => :matlab_or_objective_c,
157                 '.mf'   => 'metafont',
158                 '.mm'   => "objective_c",
159                 '.mp'   => 'metapost_with_tex',
160                 '.pas'  => "pascal",
161                 '.pp'   => "pascal",
162                 '.php'  => "php",
163                 '.php3' => "php",
164                 '.php4' => "php",
165                 '.php5' => "php",
166                 '.pl'   => "perl",
167                 '.pm'   => "perl",
168                 '.perl' => "perl",
169                 '.ph'   => "perl",
170                 '.pod'  => "perl",
171                 '.t'    => "perl",
172                 '.pike' => "pike",
173                 '.pmod' => "pike",
174                 '.py'   => "python",
175                 '.rhtml'=> "rhtml",
176                 '.rb'   => "ruby",
177                 '.rex'  => "rexx",
178                 '.rexx' => "rexx",
179                 '.s'    => "assembler",
180                 '.S'    => "assembler",
181                 '.sc'   => "scheme",
182                 '.scm'  => "scheme",
183                 '.sh'   => "shell",
184                 '.sql'  => "sql",
185                 '.st'   => "smalltalk",
186                 '.tcl'  => "tcl",
187                 '.tpl'  => "html",
188                 '.vala' => "vala",
189                 '.vb'   => "visualbasic",
190                 '.vba'  => "visualbasic",
191                 '.vbs'  => "visualbasic",
192                 '.w'    => "c_web",
193                 '.web'  => "knuth_web",
194                 '.xml'  => "xml",
195                 '.xsd'  => "xmlschema",
196                 '.xsl'  => "xslt",
197                 '.d'            => 'dmd',
198                 '.di'           => 'dmd',
199                 '.tex'  => 'tex',
200                 '.ltx'  => 'tex',
201                 '.latex'=> 'tex'
202         }
203
204         protected
205
206         # Returns a count of lines in the buffer matching the given regular expression.
207         def self.lines_matching(buffer, re)
208                 buffer.inject(0) { |total, line| line =~ re ? total+1 : total }
209         end
210
211         # For *.m files, differentiates Matlab from Objective-C.
212         #
213         # This is done with a weighted heuristic that
214         # scans the *.m file contents for keywords,
215         # and also checks for the presence of matching *.h files.
216   def self.matlab_or_objective_c(file_context)
217     buffer = file_context.contents
218
219     # if there are .h files in same directory, this probably isn't matlab
220     h_headers = 0.0
221     h_headers = -0.5 if file_context.filenames.select { |a| a =~ /\.h$/ }.any?
222
223     # if the contents contain 'function (' on a single line - very likely to be matlab
224     # if the contents contain lines starting with '%', its probably matlab comments
225     matlab_signatures = /(^\s*function\s*)|(^\s*%)/
226     matlab_sig_score = 0.1 * lines_matching(buffer, matlab_signatures)
227
228     # if the contents contains '//' or '/*', likely objective_c
229     objective_c_signatures = /(^\s*\/\/\s*)|(^\s*\/\*)|(^[+-])/
230     obj_c_sig_score = -0.1 * lines_matching(buffer, objective_c_signatures)
231
232     matlab = h_headers + matlab_sig_score + obj_c_sig_score
233
234     matlab > 0 ? 'matlab' : 'objective_c'
235   end
236
237         # For *.h files, differentiates C, C++ and Objective-C.
238         #
239         # This is done with a weighted heuristic that
240         # scans the *.h file contents for Objective-C keywords,
241         # C++ keywords and C++ headers, and also checks for the
242         # presence of matching *.m files.
243         def self.disambiguate_h_header(file_context)
244     buffer = file_context.contents
245
246     # could it be realistically be objective_c ? are there any .m files at all?
247     # Speed hack - remember our findings in case we get the same filenames over and over
248     unless defined?(file_context.filenames.contains_m)
249       file_context.filenames.extend(ContainsM)
250       file_context.filenames.contains_m = file_context.filenames.select { |a| a =~ /\.m$/ }.any?
251       file_context.filenames.contains_pike_or_pmod = file_context.filenames.select { |a| a =~ /\.p(ike|mod)$/ }.any?
252     end
253
254     if file_context.filenames.contains_m
255       # if the dir contains a matching *.m file, likely objective_c
256       if file_context.filename =~ /\.h$/
257         m_counterpart = file_context.filename.gsub(/\.h$/, ".m")
258         return 'objective_c' if file_context.filenames.include?(m_counterpart)
259       end
260
261       # ok - it just might be objective_c, let's check contents for objective_c signatures
262       objective_c_signatures = /(^\s*@interface)|(^\s*@end)/
263       objective_c = lines_matching(buffer, objective_c_signatures)
264       return 'objective_c' if objective_c > 1
265     end
266
267     if file_context.filenames.contains_pike_or_pmod
268       # The string "pike" and a selection of common Pike keywords.
269       pike_signatures = /([Pp][Ii][Kk][Ee])|(string )|(mapping)|(multiset)|(import )|(inherit )|(predef)/
270       pike = lines_matching(buffer, pike_signatures)
271       return 'pike' if pike > 0
272     end
273
274     disambiguate_c_cpp(buffer)
275         end
276
277         # A map of headers that indicate C++, but that do not have C++-specific file
278         # extensions. This list is made from the Standard, plus Technical Report 1.
279         CPP_HEADERS_MAP = %w[
280                 algorithm
281                 array
282                 bitset
283                 cassert
284                 ccomplex
285                 cctype
286                 cerrno
287                 cfenv
288                 cfloat
289                 cinttypes
290                 ciso646
291                 climits
292                 clocale
293                 cmath
294                 csetjmp
295                 csignal
296                 cstdarg
297                 cstdbool
298                 cstddef
299                 cstdint
300                 cstdio
301                 cstdlib
302                 cstring
303                 ctgmath
304                 ctime
305                 cwchar
306                 cwctype
307                 deque
308                 exception
309                 fstream
310                 functional
311                 iomanip
312                 ios
313                 iosfwd
314                 iostream
315                 istream
316                 iterator
317                 limits
318                 list
319                 locale
320                 map
321                 memory
322                 new
323                 numeric
324                 ostream
325                 queue
326                 random
327                 regex
328                 set
329                 sstream
330                 stack
331                 stdexcept
332                 streambuf
333                 string
334                 system_error
335                 tuple
336                 type_traits
337                 typeinfo
338                 unordered_map
339                 unordered_set
340                 utility
341                 valarray
342                 vector
343                 tr1/array
344                 tr1/ccomplex
345                 tr1/cctype
346                 tr1/cfenv
347                 tr1/cfloat
348                 tr1/cinttypes
349                 tr1/climits
350                 tr1/cmath
351                 tr1/complex
352                 tr1/cstdarg
353                 tr1/cstdbool
354                 tr1/cstdint
355                 tr1/cstdio
356                 tr1/cstdlib
357                 tr1/ctgmath
358                 tr1/ctime
359                 tr1/cwchar
360                 tr1/cwctype
361                 tr1/memory
362                 tr1/random
363                 tr1/regex
364                 tr1/tuple
365                 tr1/type_traits
366                 tr1/unordered_map
367                 tr1/unordered_set
368                 tr1/utility
369         ].inject({}) { | h, k | h[k] = true ; h }
370
371         # A map of keywords that indicate C++.
372         CPP_KEYWORDS_MAP = %w[
373                 template
374                 typename
375                 class
376                 namespace
377         ].inject({}) { | h, k | h[k] = true ; h }
378
379         # For *.h files that we know aren't Objective-C, differentiates C and C++.
380         #
381         # This is done with a weighted heuristic that
382         # scans the *.h file contents for C++ keywords and C++ headers.
383         def self.disambiguate_c_cpp(buffer)
384                 # Look for C++ headers
385                 return 'cpp' if extract_c_cpp_headers(buffer).detect do | header |
386                         EXTENSION_MAP[File.extname(header)] == 'cpp' or CPP_HEADERS_MAP.include? header
387                 end
388
389                 # Look for C++ keywords. This could check for comments, but doesn't.
390                 return 'cpp' if buffer.find do | line |
391                         line.split(/\W/).find do | word |
392                                 CPP_KEYWORDS_MAP.include? word
393                         end
394                 end
395
396                 # Nothing to suggest C++
397                 'c'
398         end
399
400         # Return a list of files included in a C or C++ source file.
401         def self.extract_c_cpp_headers(buffer)
402                 buffer.map do | line |
403                         m = line.match(/^#\s*include\s+[<"](.*)[>"]/) and m[1]
404                 end.find_all { | a | a }
405         end
406
407         # Tests whether the provided buffer contains binary or text content.
408         # This is not fool-proof -- we basically just check for zero values
409         # in the early bytes of the buffer. If we find a zero, we know it
410         # is not (ascii) text.
411   def self.binary_buffer?(buffer)
412     100.times do |i|
413       return true if buffer[i] == 0
414     end
415     false
416   end
417
418         # True if the provided buffer includes a '?php' directive
419   def self.php_instruction?(buffer)
420     buffer =~ /\?php/
421   end
422
423         # For *.in files, checks the prior extension.
424         # Typically used for template files (eg Makefile.in, auto.c.in, etc).
425   def self.disambiguate_in(file_context)
426     # if the filename has an extension prior to the .in
427     if file_context.filename =~ /\..*\.in$/
428       filename = file_context.filename.gsub(/\.in$/, "")
429       context = Ohcount::SimpleFileContext.new(filename, file_context.filenames, file_context.contents, file_context.file_location)
430       return detect(context)
431     end
432     nil
433   end
434
435         # For *.inc files, checks for a PHP class.
436   def self.disambiguate_inc(file_context)
437     buffer = file_context.contents
438     return nil if binary_buffer?(buffer)
439     return 'php' if php_instruction?(buffer)
440     nil
441   end
442
        # For files with extension *.cs, differentiates C# from Clearsilver.
444   def self.disambiguate_cs(file_context)
445     buffer = file_context.contents
446     return 'clearsilver_template' if lines_matching(file_context.contents, /\<\?cs/) > 0
447     return 'csharp'
448   end
449
450   def self.disambiguate_fortran(file_context)
451     buffer = file_context.contents
452
453     definitely_not_f77 = /^ [^0-9 ]{5}/
454     return 'fortranfixed' if lines_matching(buffer, definitely_not_f77) > 0
455
456     free_form_continuation = /&\s*\n\s*&/m
457     return 'fortranfree' if buffer.match(free_form_continuation)
458
459     possibly_fixed = /^ [0-9 ]{5}/
460     contig_number = /^\s*\d+\s*$/
461     buffer.scan(possibly_fixed) {|leader|
462       return 'fortranfixed' if !(leader =~ contig_number) }
463     # Might as well be free-form.
464     return 'fortranfree'
465   end
466
467         # Attempts to determine the Polyglot for files that do not have a
468         # filename extension.
469         #
        # Relies on the +file+ command line tool as its primary method.
471         #
472         # There must be a file at <tt>file_context.file_location</tt> for +file+
473         # to operate on.
474         #
475   def self.disambiguate_nil(file_context)
476     file_location = file_context.file_location
477     output = `file -b #{ file_location }`
478     case output
479     when /([\w\/]+) script text/, /script text executable for ([\w\/]+)/
480       script = $1
481       if script =~ /\/(\w*)$/
482         script = $1
483       end
484       known_languages = EXTENSION_MAP.values
485       return script.downcase if known_languages.include?(script.downcase)
486     when /([\w\-]*) shell script text/
487       case $1
488       when "Bourne-Again"
489         return "shell"
490       end
491     end
492
493     # dang... no dice
494     nil
495   end
496
497 end