# Initial Revision
# [ohcount] / lib / ohcount / detector.rb
# Determines the language family (Monoglot or Polyglot) used by a source file.
2 class Ohcount::Detector
3
# Mixin that gives the extending object a readable and writable
# +contains_m+ attribute.
#
# disambiguate_h_header extends the list of filenames with this module and
# stores there whether the list holds any *.m file, so the same list does not
# have to be scanned a second time.
module ContainsM
  attr_reader :contains_m
  attr_writer :contains_m
end
9
# Entry point of the detector.
#
# Takes a file context (only +filename+ is read here; the whole context is
# handed on to the helpers) and returns the detected language family.
#
# The lower-cased extension of the filename is looked up in EXTENSION_MAP and
# the kind of value found decides what happens next:
#
# * String - the answer itself; returned unchanged.
# * Symbol - the name of a detector class method, which is called with
#   +file_context+ and whose result is returned.
# * nil    - the extension is not in the map; disambiguate_nil decides.
#
# The helpers may return nil when they cannot identify a language.
# Raises RuntimeError if the map holds any other kind of value.
def self.detect(file_context)
  extension = File.extname(file_context.filename).downcase
  mapping = EXTENSION_MAP[extension]

  # Unambiguous extension.
  return mapping if mapping.is_a?(String)

  # Ambiguous extension: dispatch to the named disambiguation method.
  return send(mapping, file_context) if mapping.is_a?(Symbol)

  # Unknown or missing extension.
  return disambiguate_nil(file_context) if mapping.nil?

  raise RuntimeError, "Unknown file detection type"
end
33
# Extensions (lower case, leading dot included) that binary_filename? treats
# as binary formats.
#
# File.extname returns "" for a bare dotfile name such as ".svn", so the
# ".svn" entry only matches names like "backup.svn".
BINARY_EXTENSIONS = %w[
  .svn
  .jar .tar .gz .tgz .zip
  .gif .jpg .jpeg .bmp .png .tif .tiff
  .ogg .aiff .wav .mp3 .au .ra .m4a
  .pdf
  .mpg .mov .qt .avi
].freeze

# Judges from the filename alone whether a file is a binary format.
#
# The extension is compared case-insensitively, in line with detect, which
# also lower-cases extensions; "PHOTO.JPG" is therefore binary just like
# "photo.jpg".
#
# Returns true when the extension is listed in BINARY_EXTENSIONS, false
# otherwise (including for names without an extension).
def self.binary_filename?(filename)
  BINARY_EXTENSIONS.include?(File.extname(filename).downcase)
end
65
# Maps a lower-case filename extension (leading dot included) to what detect
# should do with it:
#
# * a String is the language family for that extension;
# * a Symbol names the detector class method that resolves an ambiguous
#   extension.
#
# detect lower-cases the extension before the lookup, so the upper-case '.S'
# key is never reached through detect.
EXTENSION_MAP = {
  '.ada'    => 'ada',
  '.adb'    => 'ada',
  '.asm'    => 'assembler',
  '.awk'    => 'awk',
  '.bas'    => 'visualbasic',
  '.bat'    => 'bat',
  '.boo'    => 'boo',
  '.c'      => 'cncpp',
  '.cc'     => 'cncpp',
  '.cpp'    => 'cncpp',
  '.css'    => 'css',
  '.c++'    => 'cncpp',
  '.cxx'    => 'cncpp',
  '.el'     => 'emacslisp',
  # '.cbl'  => 'cobol',
  # '.cob'  => 'cobol',
  '.cs'     => :disambiguate_cs,
  '.dylan'  => 'dylan',
  '.erl'    => 'erlang',
  '.frx'    => 'visualbasic',
  '.groovy' => 'groovy',
  '.h'      => :disambiguate_h_header,
  '.hpp'    => 'cncpp',
  '.hh'     => 'cncpp',
  '.hrl'    => 'erlang',
  '.htm'    => 'html',
  '.html'   => 'html',
  '.inc'    => :disambiguate_inc,
  '.java'   => 'java',
  '.js'     => 'javascript',
  '.jsp'    => 'jsp',
  '.lua'    => 'lua',
  '.lsp'    => 'lisp',
  '.lisp'   => 'lisp',
  '.m'      => :matlab_or_objective_c,
  '.mm'     => 'objective_c',
  '.pas'    => 'pascal',
  '.pp'     => 'pascal',
  '.php'    => 'php',
  '.php3'   => 'php',
  '.php4'   => 'php',
  '.php5'   => 'php',
  '.pl'     => 'perl',
  '.pm'     => 'perl',
  '.perl'   => 'perl',
  '.ph'     => 'perl',
  '.py'     => 'python',
  '.rhtml'  => 'rhtml',
  '.rb'     => 'ruby',
  '.rex'    => 'rexx',
  '.rexx'   => 'rexx',
  '.s'      => 'assembler',
  '.S'      => 'assembler',
  '.sc'     => 'scheme',
  '.scm'    => 'scheme',
  '.sh'     => 'shell',
  '.sql'    => 'sql',
  '.tcl'    => 'tcl',
  '.tpl'    => 'html',
  '.vb'     => 'visualbasic',
  '.vba'    => 'visualbasic',
  '.vbs'    => 'visualbasic',
  '.xml'    => 'xml',
  '.d'      => 'dmd',
  '.di'     => 'dmd'
}
133
134         protected
135
# Counts the lines of +buffer+ that match the regular expression +re+.
#
# +buffer+ may be a String, which is walked line by line with each_line, or
# any other Enumerable that yields lines (for example an Array of Strings).
# Strings are handled explicitly because String is not Enumerable on Ruby 1.9
# and later, so calling inject on one directly raises NoMethodError.
#
# Returns an Integer. A nil buffer is treated as having no lines.
def self.lines_matching(buffer, re)
  return 0 if buffer.nil?

  lines = buffer.respond_to?(:each_line) ? buffer.each_line : buffer
  lines.count { |line| line =~ re }
end
140
# For *.m files, tells Matlab apart from Objective-C.
#
# A signed score is built from the file context; a positive total means
# Matlab, anything else (including an exact tie) means Objective-C:
#
# * -5 once if any name in +file_context.filenames+ ends in ".h";
# * +1 for every line of the contents that starts (after optional
#   whitespace) with "function" or "%";
# * -1 for every line that starts (after optional whitespace) with "//" or
#   "/*", or that starts in the first column with "+" or "-".
#
# Returns 'matlab' or 'objective_c'.
def self.matlab_or_objective_c(file_context)
  buffer = file_context.contents

  # Weights are whole numbers rather than 0.5/0.1 floats: accumulated float
  # error (for example -0.5 + 0.1 * 6 - 0.1) leaves a tiny positive remainder
  # and would turn an exact tie into 'matlab'.
  score = 0

  # Header files next to the *.m file make Matlab unlikely.
  score -= 5 if file_context.filenames.any? { |name| name =~ /\.h$/ }

  # Function definitions and '%' comments point to Matlab.
  score += lines_matching(buffer, /(^\s*function\s*)|(^\s*%)/)

  # C-style comments and method declarations point to Objective-C.
  score -= lines_matching(buffer, /(^\s*\/\/\s*)|(^\s*\/\*)|(^[+-])/)

  score > 0 ? 'matlab' : 'objective_c'
end
166
# For *.h files, tells C/C++ apart from Objective-C.
#
# The decision is made in three steps:
#
# 1. If +file_context.filenames+ holds no name ending in ".m", the header is
#    'cncpp'. The outcome of that scan is stored on the filenames object
#    itself (via ContainsM), so a list that is passed in again is not
#    scanned again.
# 2. If the filename ends in ".h" and the same name with ".m" instead is in
#    the list, the header is 'objective_c'.
# 3. Otherwise the contents decide: more than one line starting (after
#    optional whitespace) with "@interface" or "@end" means 'objective_c',
#    anything less means 'cncpp'.
def self.disambiguate_h_header(file_context)
  buffer = file_context.contents
  names = file_context.filenames

  # Scan for *.m files only if this list has not been tagged before.
  unless names.respond_to?(:contains_m)
    names.extend(ContainsM)
    names.contains_m = names.any? { |name| name =~ /\.m$/ }
  end
  return 'cncpp' unless names.contains_m

  # A sibling implementation file settles the question.
  header = file_context.filename
  if header =~ /\.h$/ && names.include?(header.gsub(/\.h$/, ".m"))
    return 'objective_c'
  end

  # Fall back to counting Objective-C keywords in the contents.
  keyword_lines = lines_matching(buffer, /(^\s*@interface)|(^\s*@end)/)
  if keyword_lines > 1
    'objective_c'
  else
    'cncpp'
  end
end
197
# Tests whether +buffer+ holds binary rather than text content.
#
# This is not fool-proof: only the first 100 bytes are inspected, and the
# buffer is reported as binary when one of them is a zero byte, which cannot
# occur in (ASCII) text.
#
# The bytes are read with each_byte because indexing a String with an
# Integer yields a one-character String on Ruby 1.9 and later, which never
# equals 0.
#
# Returns true or false. A nil or empty buffer is not binary.
def self.binary_buffer?(buffer)
  buffer.to_s.each_byte.first(100).include?(0)
end
208
# Checks +buffer+ for a '?php' directive.
#
# Returns the position of the first '?php' in the buffer, which is truthy,
# or nil when there is none.
def self.php_instruction?(buffer)
  php_directive = %r{\?php}
  buffer =~ php_directive
end
213
# For *.inc files, decides whether the file is PHP.
#
# Returns 'php' when the contents are not binary (see binary_buffer?) and
# contain a '?php' directive (see php_instruction?); returns nil in every
# other case.
def self.disambiguate_inc(file_context)
  contents = file_context.contents

  if binary_buffer?(contents)
    nil
  elsif php_instruction?(contents)
    'php'
  end
end
221
# For files with the extension *.cs, tells C# apart from Clearsilver.
#
# Returns 'clearsilver_template' when at least one line of the contents
# contains "<?cs", and 'csharp' otherwise.
#
# The contents are read from the file context once and reused for the scan.
def self.disambiguate_cs(file_context)
  buffer = file_context.contents
  lines_matching(buffer, /\<\?cs/) > 0 ? 'clearsilver_template' : 'csharp'
end
228
# Attempts to determine the Polyglot for files whose extension is not in
# EXTENSION_MAP (including files without any extension).
#
# Relies on the +file+ command line tool: there must be a file at
# <tt>file_context.file_location</tt> for +file+ to operate on.
#
# The output of <tt>file -b</tt> is searched for "<interpreter> script text"
# or "script text executable for <interpreter>". An interpreter given as a
# path is reduced to its last component, and the lower-cased name is returned
# if it is one of the values of EXTENSION_MAP. Shell scripts are covered by
# the same rule: "... shell script text" yields "shell", which is in the map.
#
# Returns the language name as a String, or nil when +file+ reports nothing
# that can be mapped to a known language.
def self.disambiguate_nil(file_context)
  # The path is passed as its own argv element (no shell involved), so
  # spaces and shell metacharacters in it are harmless; "--" keeps a path
  # that starts with "-" from being read as an option.
  command = ['file', '-b', '--', file_context.file_location.to_s]
  output = IO.popen(command) { |io| io.read }

  interpreter =
    case output
    when /([\w\/]+) script text/, /script text executable for ([\w\/]+)/
      Regexp.last_match(1)
    end

  # dang... no dice
  return nil unless interpreter

  # Reduce an interpreter path such as "/usr/bin/ruby" to "ruby".
  interpreter = Regexp.last_match(1) if interpreter =~ /\/(\w*)$/

  language = interpreter.downcase
  EXTENSION_MAP.values.include?(language) ? language : nil
end
258
259 end