support CWEB (literate programming for C)
[ohcount] / lib / ohcount / detector.rb
1 # The Detector determines which Monoglot or Polyglot should be
2 # used to parse a source file.
3 #
4 # The Detector primarily uses filename extensions to identify languages.
5 #
6 # The hash EXTENSION_MAP maps a filename extension to the name of a parser.
7 #
8 # If a filename extension is not enough to determine the correct parser (for
9 # instance, the *.m extension can indicate either a Matlab or Objective-C file),
10 # then the EXTENSION_MAP hash will contain a symbol identifying a Ruby method
11 # which will be invoked. This Ruby method can examine the file
12 # contents and return the name of the correct parser.
13 #
14 # Many source files do not have an extension. The method +disambiguate_nil+
15 # is called in these cases. The +file+ command line tool is used to determine
16 # the type of file and select a parser.
17 #
18 # The Detector is covered by DetectorTest.
19 #
20 class Ohcount::Detector
21
22         module ContainsM
23                 # A performance hack -- once we've checked for the presence of *.m files, the result
24                 # is stored here to avoid checking twice.
25                 attr_accessor :contains_m
26                 # A performance hack -- once we've checked for the presence of *.pike and *.pmod files, the result
27                 # is stored here to avoid checking twice.
28                 attr_accessor :contains_pike_or_pmod
29         end
30
31         # The primary entry point for the detector.
32         # Given a file context containing the file name, content, and an array of
33         # other filenames in the source tree, attempts to detect which
34         # language family (Monoglot or Polyglot) is in use for this file.
35         #
36         # Returns nil if the language is not recognized or if the file does not
37         # contain any code.
38         #
39         # Example:
40         #
41         #   # List all C files in the 'src' directory
42         #   Dir.entries("src").each do |file|
43         #     context = Ohcount::SimpleFileContext.new(file)
44         #     polyglot = Ohcount::Detector.detect(context)
45         #     puts "#{file}" if polyglot == 'c'
46         #   end
47         #
48         def self.detect(file_context)
49                 # start with extension
50                 polyglot = EXTENSION_MAP[File.extname(file_context.filename)]
51                 polyglot = EXTENSION_MAP[File.extname(file_context.filename).downcase] unless polyglot
52     case polyglot
53     when String
54       # simplest case
55                   return polyglot if polyglot.is_a?(String)
56     when Symbol
57                   # extension is ambiguous - requires custom disambiguation
58                         self.send(polyglot, file_context)
59     when NilClass
60       return disambiguate_nil(file_context)
61     else
62       raise RuntimeError.new("Unknown file detection type")
63           end
64   end
65
66         # Based solely on the filename, makes a judgment whether a file is a binary format.
67         def self.binary_filename?(filename)
68                 ignore = [
69                         ".svn",
70                         ".jar",
71                         ".tar",
72                         ".gz",
73                         ".tgz",
74                         ".zip",
75                         ".gif",
76                         ".jpg",
77                         ".jpeg",
78                         ".bmp",
79                         ".png",
80                         ".tif",
81                         ".tiff",
82                         ".ogg",
83                         ".aiff",
84                         ".wav",
85                         ".mp3",
86                         ".au",
87                         ".ra",
88                         ".m4a",
89                         ".pdf",
90                         ".mpg",
91                         ".mov",
92                         ".qt",
93                         ".avi"
94                         ]
95                 ignore.include?(File.extname(filename))
96         end
97
        # Maps a filename extension (including the leading dot) to a parser.
        #
        # If an extension maps to a string, that string must be the name of a glot.
        # If an extension maps to a Ruby symbol, that symbol must be the name of a
        # Ruby method which will return the name of a glot.
        #
        # Keys are case-sensitive: detect looks up the extension exactly as written
        # first and only then retries with the lower-cased extension. That is why
        # both '.c' (C) and '.C' (C++), or '.h' and '.H', can coexist here.
        EXTENSION_MAP = {
                '.ada'  => "ada",
                '.adb'  => "ada",
                '.ads'  => "ada",
                '.asm'  => "assembler",
                '.awk'  => "awk",
                '.bas'  => "visualbasic",
                '.bat'  => "bat",
                '.boo'  => "boo",
                '.c'    => "c",
                '.C'    => "cpp",
                '.cc'   => "cpp",
                '.cpp'  => "cpp",
                '.css'  => "css",
                '.c++'  => "cpp",
                '.cxx'  => "cpp",
                '.com'  => "dcl",
                '.el'   => "emacslisp",
                #               '.cbl'  => "cobol",
                #               '.cob'  => "cobol",
                '.cs'   => :disambiguate_cs,
                '.dylan'=> "dylan",
                '.erl'  => "erlang",
                '.f'    => :disambiguate_fortran,
                '.ftn'  => :disambiguate_fortran,
                '.f77'  => :disambiguate_fortran,
                '.f90'  => :disambiguate_fortran,
                '.f95'  => :disambiguate_fortran,
                '.f03'  => :disambiguate_fortran,
                '.F'    => :disambiguate_fortran,
                '.F77'  => :disambiguate_fortran,
                '.F90'  => :disambiguate_fortran,
                '.F95'  => :disambiguate_fortran,
                '.F03'  => :disambiguate_fortran,
                '.frx'  => "visualbasic",
                '.groovy'=> "groovy",
                '.h'    => :disambiguate_h_header,
                '.H'    => "cpp",
                '.hpp'  => "cpp",
                '.h++'  => "cpp",
                '.hs'   => "haskell",
                '.hxx'  => "cpp",
                '.hh'   => "cpp",
                '.hrl'  => "erlang",
                '.htm'  => "html",
                '.html' => "html",
                '.in'   => :disambiguate_in,
                '.inc'  => :disambiguate_inc,
                '.java' => "java",
                '.js'   => "javascript",
                '.jsp'  => "jsp",
                '.lua'  => "lua",
                '.lsp'  => "lisp",
                '.lisp' => "lisp",
                '.m'    => :matlab_or_objective_c,
                '.mf'   => 'metafont',
                '.mm'   => "objective_c",
                '.mp'   => 'metapost_with_tex',
                '.pas'  => "pascal",
                '.pp'   => "pascal",
                '.php'  => "php",
                '.php3' => "php",
                '.php4' => "php",
                '.php5' => "php",
                '.pl'   => "perl",
                '.pm'   => "perl",
                '.perl' => "perl",
                '.ph'   => "perl",
                '.pod'  => "perl",
                '.t'    => "perl",
                '.pike' => "pike",
                '.pmod' => "pike",
                '.py'   => "python",
                '.rhtml'=> "rhtml",
                '.rb'   => "ruby",
                '.rex'  => "rexx",
                '.rexx' => "rexx",
                '.s'    => "assembler",
                '.S'    => "assembler",
                '.sc'   => "scheme",
                '.scm'  => "scheme",
                '.sh'   => "shell",
                '.sql'  => "sql",
                '.st'   => "smalltalk",
                '.tcl'  => "tcl",
                '.tpl'  => "html",
                '.vala' => "vala",
                '.vb'   => "visualbasic",
                '.vba'  => "visualbasic",
                '.vbs'  => "visualbasic",
                '.w'    => "c_web",
                '.web'  => "knuth_web",
                '.xml'  => "xml",
                '.xsd'  => "xmlschema",
                '.xsl'  => "xslt",
                '.d'            => 'dmd',
                '.di'           => 'dmd',
                '.tex'  => 'tex',
                '.ltx'  => 'tex',
                '.latex'=> 'tex'
        }
202
203         protected
204
205         # Returns a count of lines in the buffer matching the given regular expression.
206         def self.lines_matching(buffer, re)
207                 buffer.inject(0) { |total, line| line =~ re ? total+1 : total }
208         end
209
210         # For *.m files, differentiates Matlab from Objective-C.
211         #
212         # This is done with a weighted heuristic that
213         # scans the *.m file contents for keywords,
214         # and also checks for the presence of matching *.h files.
215   def self.matlab_or_objective_c(file_context)
216     buffer = file_context.contents
217
218     # if there are .h files in same directory, this probably isn't matlab
219     h_headers = 0.0
220     h_headers = -0.5 if file_context.filenames.select { |a| a =~ /\.h$/ }.any?
221
222     # if the contents contain 'function (' on a single line - very likely to be matlab
223     # if the contents contain lines starting with '%', its probably matlab comments
224     matlab_signatures = /(^\s*function\s*)|(^\s*%)/
225     matlab_sig_score = 0.1 * lines_matching(buffer, matlab_signatures)
226
227     # if the contents contains '//' or '/*', likely objective_c
228     objective_c_signatures = /(^\s*\/\/\s*)|(^\s*\/\*)|(^[+-])/
229     obj_c_sig_score = -0.1 * lines_matching(buffer, objective_c_signatures)
230
231     matlab = h_headers + matlab_sig_score + obj_c_sig_score
232
233     matlab > 0 ? 'matlab' : 'objective_c'
234   end
235
236         # For *.h files, differentiates C, C++ and Objective-C.
237         #
238         # This is done with a weighted heuristic that
239         # scans the *.h file contents for Objective-C keywords,
240         # C++ keywords and C++ headers, and also checks for the
241         # presence of matching *.m files.
242         def self.disambiguate_h_header(file_context)
243     buffer = file_context.contents
244
245     # could it be realistically be objective_c ? are there any .m files at all?
246     # Speed hack - remember our findings in case we get the same filenames over and over
247     unless defined?(file_context.filenames.contains_m)
248       file_context.filenames.extend(ContainsM)
249       file_context.filenames.contains_m = file_context.filenames.select { |a| a =~ /\.m$/ }.any?
250       file_context.filenames.contains_pike_or_pmod = file_context.filenames.select { |a| a =~ /\.p(ike|mod)$/ }.any?
251     end
252
253     if file_context.filenames.contains_m
254       # if the dir contains a matching *.m file, likely objective_c
255       if file_context.filename =~ /\.h$/
256         m_counterpart = file_context.filename.gsub(/\.h$/, ".m")
257         return 'objective_c' if file_context.filenames.include?(m_counterpart)
258       end
259
260       # ok - it just might be objective_c, let's check contents for objective_c signatures
261       objective_c_signatures = /(^\s*@interface)|(^\s*@end)/
262       objective_c = lines_matching(buffer, objective_c_signatures)
263       return 'objective_c' if objective_c > 1
264     end
265
266     if file_context.filenames.contains_pike_or_pmod
267       # The string "pike" and a selection of common Pike keywords.
268       pike_signatures = /([Pp][Ii][Kk][Ee])|(string )|(mapping)|(multiset)|(import )|(inherit )|(predef)/
269       pike = lines_matching(buffer, pike_signatures)
270       return 'pike' if pike > 0
271     end
272
273     disambiguate_c_cpp(buffer)
274         end
275
276         # A map of headers that indicate C++, but that do not have C++-specific file
277         # extensions. This list is made from the Standard, plus Technical Report 1.
278         CPP_HEADERS_MAP = %w[
279                 algorithm
280                 array
281                 bitset
282                 cassert
283                 ccomplex
284                 cctype
285                 cerrno
286                 cfenv
287                 cfloat
288                 cinttypes
289                 ciso646
290                 climits
291                 clocale
292                 cmath
293                 csetjmp
294                 csignal
295                 cstdarg
296                 cstdbool
297                 cstddef
298                 cstdint
299                 cstdio
300                 cstdlib
301                 cstring
302                 ctgmath
303                 ctime
304                 cwchar
305                 cwctype
306                 deque
307                 exception
308                 fstream
309                 functional
310                 iomanip
311                 ios
312                 iosfwd
313                 iostream
314                 istream
315                 iterator
316                 limits
317                 list
318                 locale
319                 map
320                 memory
321                 new
322                 numeric
323                 ostream
324                 queue
325                 random
326                 regex
327                 set
328                 sstream
329                 stack
330                 stdexcept
331                 streambuf
332                 string
333                 system_error
334                 tuple
335                 type_traits
336                 typeinfo
337                 unordered_map
338                 unordered_set
339                 utility
340                 valarray
341                 vector
342                 tr1/array
343                 tr1/ccomplex
344                 tr1/cctype
345                 tr1/cfenv
346                 tr1/cfloat
347                 tr1/cinttypes
348                 tr1/climits
349                 tr1/cmath
350                 tr1/complex
351                 tr1/cstdarg
352                 tr1/cstdbool
353                 tr1/cstdint
354                 tr1/cstdio
355                 tr1/cstdlib
356                 tr1/ctgmath
357                 tr1/ctime
358                 tr1/cwchar
359                 tr1/cwctype
360                 tr1/memory
361                 tr1/random
362                 tr1/regex
363                 tr1/tuple
364                 tr1/type_traits
365                 tr1/unordered_map
366                 tr1/unordered_set
367                 tr1/utility
368         ].inject({}) { | h, k | h[k] = true ; h }
369
370         # A map of keywords that indicate C++.
371         CPP_KEYWORDS_MAP = %w[
372                 template
373                 typename
374                 class
375                 namespace
376         ].inject({}) { | h, k | h[k] = true ; h }
377
378         # For *.h files that we know aren't Objective-C, differentiates C and C++.
379         #
380         # This is done with a weighted heuristic that
381         # scans the *.h file contents for C++ keywords and C++ headers.
382         def self.disambiguate_c_cpp(buffer)
383                 # Look for C++ headers
384                 return 'cpp' if extract_c_cpp_headers(buffer).detect do | header |
385                         EXTENSION_MAP[File.extname(header)] == 'cpp' or CPP_HEADERS_MAP.include? header
386                 end
387
388                 # Look for C++ keywords. This could check for comments, but doesn't.
389                 return 'cpp' if buffer.find do | line |
390                         line.split(/\W/).find do | word |
391                                 CPP_KEYWORDS_MAP.include? word
392                         end
393                 end
394
395                 # Nothing to suggest C++
396                 'c'
397         end
398
399         # Return a list of files included in a C or C++ source file.
400         def self.extract_c_cpp_headers(buffer)
401                 buffer.map do | line |
402                         m = line.match(/^#\s*include\s+[<"](.*)[>"]/) and m[1]
403                 end.find_all { | a | a }
404         end
405
406         # Tests whether the provided buffer contains binary or text content.
407         # This is not fool-proof -- we basically just check for zero values
408         # in the early bytes of the buffer. If we find a zero, we know it
409         # is not (ascii) text.
410   def self.binary_buffer?(buffer)
411     100.times do |i|
412       return true if buffer[i] == 0
413     end
414     false
415   end
416
417         # True if the provided buffer includes a '?php' directive
418   def self.php_instruction?(buffer)
419     buffer =~ /\?php/
420   end
421
422         # For *.in files, checks the prior extension.
423         # Typically used for template files (eg Makefile.in, auto.c.in, etc).
424   def self.disambiguate_in(file_context)
425     # if the filename has an extension prior to the .in
426     if file_context.filename =~ /\..*\.in$/
427       filename = file_context.filename.gsub(/\.in$/, "")
428       context = Ohcount::SimpleFileContext.new(filename, file_context.filenames, file_context.contents, file_context.file_location)
429       return detect(context)
430     end
431     nil
432   end
433
434         # For *.inc files, checks for a PHP class.
435   def self.disambiguate_inc(file_context)
436     buffer = file_context.contents
437     return nil if binary_buffer?(buffer)
438     return 'php' if php_instruction?(buffer)
439     nil
440   end
441
        # For files with extension *.cs, differentiates C# from Clearsilver.
443   def self.disambiguate_cs(file_context)
444     buffer = file_context.contents
445     return 'clearsilver_template' if lines_matching(file_context.contents, /\<\?cs/) > 0
446     return 'csharp'
447   end
448
449   def self.disambiguate_fortran(file_context)
450     buffer = file_context.contents
451
452     definitely_not_f77 = /^ [^0-9 ]{5}/
453     return 'fortranfixed' if lines_matching(buffer, definitely_not_f77) > 0
454
455     free_form_continuation = /&\s*\n\s*&/m
456     return 'fortranfree' if buffer.match(free_form_continuation)
457
458     possibly_fixed = /^ [0-9 ]{5}/
459     contig_number = /^\s*\d+\s*$/
460     buffer.scan(possibly_fixed) {|leader|
461       return 'fortranfixed' if !(leader =~ contig_number) }
462     # Might as well be free-form.
463     return 'fortranfree'
464   end
465
466         # Attempts to determine the Polyglot for files that do not have a
467         # filename extension.
468         #
        # Relies on the Unix +file+ command line tool as its primary method.
470         #
471         # There must be a file at <tt>file_context.file_location</tt> for +file+
472         # to operate on.
473         #
474   def self.disambiguate_nil(file_context)
475     file_location = file_context.file_location
476     output = `file -b #{ file_location }`
477     case output
478     when /([\w\/]+) script text/, /script text executable for ([\w\/]+)/
479       script = $1
480       if script =~ /\/(\w*)$/
481         script = $1
482       end
483       known_languages = EXTENSION_MAP.values
484       return script.downcase if known_languages.include?(script.downcase)
485     when /([\w\-]*) shell script text/
486       case $1
487       when "Bourne-Again"
488         return "shell"
489       end
490     end
491
492     # dang... no dice
493     nil
494   end
495
496 end