1 // detector.c written by Mitchell Foral. mitchell<att>caladbolg.net.
2 // See COPYING for license information.
10 #include "languages.h"
12 #include "hash/cppheader_hash.h"
13 #include "hash/disambiguatefunc_hash.h"
14 #include "hash/extension_hash.h"
15 #include "hash/filename_hash.h"
16 #include "hash/language_hash.h"
// Helpers for language strings obtained from the extension map.
// ISAMBIGUOUS(x) is true when the first character of x is '\1';
// DISAMBIGUATEWHAT(x) yields the address of the text following that marker,
// which is used as the key for the disambiguation-function lookup.
// NOTE(review): neither macro parenthesizes its argument (or, for
// DISAMBIGUATEWHAT, its expansion), so pass only simple identifiers.
18 #define ISAMBIGUOUS(x) (x[0] == '\1')
19 #define DISAMBIGUATEWHAT(x) &x[1]
// Detects the language of sourcefile and returns it as a string.
// The visible stages, in order: extension lookup (as given, then lower-cased),
// whole-filename lookup, an Emacs "-*- mode -*-" line near the top of the
// contents, and finally the output of `file -b` run on the file's path.
// NOTE(review): some statements of this function are not visible in this
// view; the notes below describe only what the visible lines establish.
21 const char *ohcount_detect_language(SourceFile *sourcefile) {
22 const char *language = NULL;
26 // Attempt to detect based on file extension.
27 length = strlen(sourcefile->ext);
28 struct ExtensionMap *re = ohcount_hash_language_from_ext(sourcefile->ext,
30 if (re) language = re->value;
31 if (language == NULL) {
32 // Try the lower-case version of this extension.
// NOTE(review): lowerext has `length` elements, yet index `length` is written
// two statements below -- one past the end. It needs length + 1 elements.
33 char lowerext[length];
34 strncpy(lowerext, sourcefile->ext, length);
35 lowerext[length] = '\0';
// NOTE(review): tolower() takes an int in unsigned char range; passing a
// plain char that is negative is undefined -- cast to unsigned char.
36 for (p = lowerext; p < lowerext + length; p++) *p = tolower(*p);
37 struct ExtensionMap *re = ohcount_hash_language_from_ext(lowerext, length);
38 if (re) return re->value;
// An ambiguous entry names a disambiguation function (keyed by the text after
// the '\1' marker); that function makes the final decision.
// NOTE(review): ISAMBIGUOUS dereferences language; a NULL guard is presumably
// on a line not visible here -- confirm.
41 if (ISAMBIGUOUS(language)) {
42 // Call the appropriate function for disambiguation.
43 length = strlen(DISAMBIGUATEWHAT(language));
44 struct DisambiguateFuncsMap *rd =
45 ohcount_hash_disambiguate_func_from_id(DISAMBIGUATEWHAT(language),
47 if (rd) return rd->value(sourcefile);
48 } else return language;
51 // Attempt to detect based on filename.
52 length = strlen(sourcefile->filename);
53 struct FilenameMap *rf =
54 ohcount_hash_language_from_filename(sourcefile->filename, length);
55 if (rf) return rf->value;
57 char line[81], buf[81];
59 // Attempt to detect using Emacs mode line (/^-\*-\s*mode[\s:]*\w/i).
60 p = ohcount_sourcefile_get_contents(sourcefile);
62 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
64 // Get the contents of the first line.
65 while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
// NOTE(review): length can equal sizeof(line) (81). strncpy then fills the
// whole buffer without a terminator, and any line[length] write (not visible
// here) would be out of bounds. Capping at sizeof(line) - 1 looks safer.
66 length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
67 strncpy(line, p, length);
69 if (*line == '#' && *(line + 1) == '!') {
70 // First line was sh-bang; loop to get contents of second line.
// NOTE(review): this skip over line terminators has no `pe < eof` bound.
71 while (*pe == '\r' || *pe == '\n') pe++;
// Lower-case the line so the "-*-" / "mode" matching is case-insensitive.
75 for (p = line; p < line + strlen(line); p++) *p = tolower(*p);
76 p = strstr(line, "-*-");
79 while (*p == ' ' || *p == '\t') p++;
80 if (strncmp(p, "mode", 4) == 0) {
82 while (*p == ' ' || *p == '\t' || *p == ':') p++;
85 while (isalnum(*pe)) pe++;
87 strncpy(buf, p, length);
89 struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
90 if (rl) return rl->value;
93 // Attempt to detect based on Unix 'file' command.
// Prefer diskpath over filepath when it is set.
94 char *path = sourcefile->filepath;
95 if (sourcefile->diskpath)
96 path = sourcefile->diskpath;
// NOTE(review): "file -b '%s'" expands to strlen(path) + 10 characters plus
// the terminating NUL, i.e. strlen(path) + 11 bytes, so this buffer is one
// byte short. The path is also placed between single quotes unescaped; a path
// containing ' ends the quoting and reaches the shell run by popen().
97 char command[strlen(path) + 10];
98 sprintf(command, "file -b '%s'", path);
99 FILE *f = popen(command, "r");
// NOTE(review): popen() can return NULL; a check is presumably on a line not
// visible here -- confirm. fgets() may also leave line unchanged on failure.
101 fgets(line, sizeof(line), f);
102 for (p = line; p < line + strlen(line); p++) *p = tolower(*p);
103 p = strstr(line, "script text");
104 if (p && p == line) { // /^script text(?: executable)? for \w/
105 p = strstr(line, "for ");
109 while (isalnum(*pe)) pe++;
111 strncpy(buf, p, length);
113 struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
114 if (rl) language = rl->value;
116 } else if (p) { // /(\w+)(?: -\w+)* script text/
// Walk backwards from "script text" to the word that precedes it, skipping
// any "-switch" words on the way.
120 while (*p == ' ') p--;
121 while (p != line && isalnum(*(p - 1))) p--;
// NOTE(review): the loop above can stop with p == line, in which case
// *(p - 1) reads one byte before the buffer.
122 if (*(p - 1) == '-') p--;
123 } while (*p == '-'); // Skip over any switches.
125 strncpy(buf, p, length);
127 struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
128 if (rl) language = rl->value;
// Last resort: any mention of "xml" in the `file` output.
129 } else if (strstr(line, "xml")) language = LANG_XML;
131 if (language) return language;
// Inspects the first "<%@ ... %>" directive in the contents and tests whether
// it is a "page" directive containing language="vb".
// NOTE(review): the returned language values and several statements are on
// lines not visible in this view.
137 const char *disambiguate_aspx(SourceFile *sourcefile) {
141 p = ohcount_sourcefile_get_contents(sourcefile);
142 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
144 // /<%@\s*Page[^>]+Language="VB"[^>]+%>/
// NOTE(review): strstr() returns NULL when "<%@" is absent, and that result
// is passed straight into the next strstr() -- confirm a guard exists or add
// one.
145 p = strstr(p, "<%@");
146 pe = strstr(p, "%>");
151 strncpy(buf, p, length);
// The matches below use lower-case literals ("page", language="vb"); the body
// of this loop is not visible here but presumably lower-cases buf -- confirm.
153 for (p = buf; p < buf + strlen(buf); p++)
156 while (*p == ' ' || *p == '\t') p++;
157 if (strncmp(p, "page", 4) == 0) {
159 if (strstr(p, "language=\"vb\""))
// Tests the contents against the constructs sketched in the regex comment
// below (implement / include "...";  / return|break|continue / pick|case).
// The last visible statement hands the decision to disambiguate_basic().
// NOTE(review): what is returned when one of the patterns matches is on lines
// not visible in this view.
168 const char *disambiguate_b(SourceFile *sourcefile) {
169 char *p = ohcount_sourcefile_get_contents(sourcefile);
170 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
172 // /(implement[ \t])|(include[ \t]+"[^"]*";)|
173 // ((return|break|continue).*;|(pick|case).*\{)/
174 if (strncmp(p, "implement", 9) == 0 &&
175 (*(p + 9) == ' ' || *(p + 9) == '\t'))
177 else if (strncmp(p, "include", 7) == 0 &&
178 (*(p + 7) == ' ' || *(p + 7) == '\t')) {
180 while (*p == ' ' || *p == '\t') p++;
// NOTE(review): *p is read before the p < eof bound is tested; swapping the
// operands would avoid reading at eof.
182 while (*p != '"' && p < eof) p++;
// Closing quote must be followed directly by ';'.
183 if (*p == '"' && *(p + 1) == ';')
186 } else if (strncmp(p, "return", 6) == 0 ||
187 strncmp(p, "break", 5) == 0 ||
188 strncmp(p, "continue", 8) == 0) {
191 } else if (strncmp(p, "pick", 4) == 0 ||
192 strncmp(p, "case", 4) == 0) {
198 return disambiguate_basic(sourcefile);
// Chooses between the BASIC dialects. Visible outcomes:
//   - LANG_CLASSIC_BASIC when a content line starts with digits followed by
//     blank space (the line-number test below);
//   - LANG_VISUALBASIC when a file in the same context has a frm/frx/vba/vbp/
//     vbs extension;
//   - LANG_STRUCTURED_BASIC otherwise.
// NOTE(review): some statements are on lines not visible in this view.
201 const char *disambiguate_basic(SourceFile *sourcefile) {
205 // Attempt to detect based on file contents.
207 p = ohcount_sourcefile_get_contents(sourcefile);
209 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
211 // Get a line at a time.
212 while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
// NOTE(review): length can equal sizeof(line); strncpy then leaves no room
// for a terminator. Capping at sizeof(line) - 1 looks safer -- confirm.
213 length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
214 strncpy(line, p, length);
// Skip the leading digits, then require blank space after them.
222 while (isdigit(*p)) p++;
223 if (*p == ' ' || *p == '\t') {
225 while (*p == ' ' || *p == '\t') p++;
227 return LANG_CLASSIC_BASIC;
// Step over the line terminator(s) to reach the next line.
233 while (*pe == '\r' || *pe == '\n') pe++;
237 // Attempt to detect from associated VB files in file context.
238 char **filenames = ohcount_sourcefile_get_filenames(sourcefile);
// The list is walked until its NULL terminator.
241 for (i = 0; filenames[i] != NULL; i++) {
// Walk back from the end of the name to just after the last '.'.
242 pe = filenames[i] + strlen(filenames[i]);
244 while (p > filenames[i] && *(p - 1) != '.') p--;
247 (strncmp(p, "frm", length) == 0 ||
248 strncmp(p, "frx", length) == 0 ||
249 strncmp(p, "vba", length) == 0 ||
250 strncmp(p, "vbp", length) == 0 ||
251 strncmp(p, "vbs", length) == 0)) {
252 return LANG_VISUALBASIC;
257 return LANG_STRUCTURED_BASIC;
// Returns LANG_CLEARSILVER_TEMPLATE when the contents contain "<?cs".
// NOTE(review): the result for the non-matching case is on a line not visible
// in this view. strstr() requires the contents to be a non-NULL,
// NUL-terminated string -- confirm that ohcount_sourcefile_get_contents
// guarantees this.
260 const char *disambiguate_cs(SourceFile *sourcefile) {
261 // Attempt to detect based on file contents.
262 if (strstr(ohcount_sourcefile_get_contents(sourcefile), "<?cs"))
263 return LANG_CLEARSILVER_TEMPLATE;
// Chooses between LANG_FORTRANFIXED and LANG_FORTRANFREE by scanning the
// contents: a line starting with a space has its first columns examined, and
// a trailing '&' followed by a line break is treated as free-form
// continuation. Falls back to LANG_FORTRANFREE.
// NOTE(review): some statements are on lines not visible in this view.
268 const char *disambiguate_fortran(SourceFile *sourcefile) {
271 p = ohcount_sourcefile_get_contents(sourcefile);
272 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
274 if (*p == ' ' && p + 5 < eof) {
// Columns 2-6 (offsets 1..5) may hold only digits or spaces.
276 for (i = 1; i <= 5; i++)
277 if (!isdigit(*(p + i)) && *(p + i) != ' ')
// NOTE(review): the comment below and the returned value appear to disagree
// -- confirm which one is intended.
278 return LANG_FORTRANFIXED; // definitely not f77
279 // Possibly fixed (doesn't match /^\s*\d+\s*$/).
281 while (*pe == ' ' || *pe == '\t') pe++;
284 return LANG_FORTRANFIXED;
285 while (isdigit(*pe)) pe++;
286 while (*pe == ' ' || *pe == '\t') pe++;
287 if (*pe != '\r' && *pe != '\n' && pe - p == 5)
288 return LANG_FORTRANFIXED;
// Advance to the end of the line or to a '&'.
// NOTE(review): *p is read before the p < eof bound is tested.
291 while (*p != '\r' && *p != '\n' && *p != '&' && p < eof) p++;
294 // Look for free-form continuation.
295 while (*p == ' ' || *p == '\t') p++;
296 if (*p == '\r' || *p == '\n') {
298 while (*pe == '\r' || *pe == '\n' || *pe == ' ' || *pe == '\t') pe++;
300 return LANG_FORTRANFREE;
303 while (*p == '\r' || *p == '\n') p++;
305 return LANG_FORTRANFREE; // might as well be free-form
// Decides what language a header file belongs to. Visible checks:
//   - for extension "h", a sibling file with the same name but ending in 'm'
//     yields LANG_OBJECTIVE_C;
//   - #include lines are examined for known C++ headers and for header
//     extensions that map to LANG_CPP;
//   - lines are scanned for the words class, namespace, template, typename.
// NOTE(review): the results of the C++ checks and the final fallback are on
// lines not visible in this view.
308 const char *disambiguate_h(SourceFile *sourcefile) {
312 // If the directory contains a matching *.m file, likely Objective C.
313 length = strlen(sourcefile->filename);
314 if (strcmp(sourcefile->ext, "h") == 0) {
316 strncpy(path, sourcefile->filename, length);
// Replace the final character of the copied name with 'm'.
318 *(path + length - 1) = 'm';
319 char **filenames = ohcount_sourcefile_get_filenames(sourcefile);
322 for (i = 0; filenames[i] != NULL; i++)
323 if (strcmp(path, filenames[i]) == 0)
324 return LANG_OBJECTIVE_C;
328 // Attempt to detect based on file contents.
329 char line[81], buf[81];
330 p = ohcount_sourcefile_get_contents(sourcefile);
332 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
334 // Get a line at a time.
335 while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
// NOTE(review): length can equal sizeof(line); strncpy then leaves no room
// for a terminator. Capping at sizeof(line) - 1 looks safer -- confirm.
336 length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
337 strncpy(line, p, length);
341 // Look for C++ headers.
344 while (*p == ' ' || *p == '\t') p++;
345 if (strncmp(p, "include", 7) == 0 &&
346 (*(p + 7) == ' ' || *(p + 7) == '\t')) {
347 // /^#\s*include\s+[<"][^>"]+[>"]/
349 while (*p == ' ' || *p == '\t') p++;
350 if (*p == '<' || *p == '"') {
351 // Is the header file a C++ header file?
354 while (pe < p + strlen(line) && *pe != '>' && *pe != '"') pe++;
356 strncpy(buf, p, length);
358 if (ohcount_hash_is_cppheader(buf, length))
360 // Is the extension for the header file a C++ file?
// Walk back to just after the last '.' of the header name.
362 while (p > line && *(p - 1) != '.') p--;
364 strncpy(buf, p, length);
366 struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
367 if (re && strcmp(re->value, LANG_CPP) == 0)
373 // Look for C++ keywords.
375 while (p < line + strlen(line) && *p != '\r' && *p != '\n') {
// A keyword candidate starts at a lower-case letter not preceded by an
// identifier character.
// NOTE(review): if p starts at line (its initialization is not visible here),
// *(p - 1) reads one byte before the buffer on the first iteration.
376 if (islower(*p) && !isalnum(*(p - 1)) && *(p - 1) != '_') {
378 while (islower(*pe)) pe++;
379 if (!isalnum(*pe) && *pe != '_') {
381 strncpy(buf, p, length);
383 if (strcmp(buf, "class") == 0 ||
384 strcmp(buf, "namespace") == 0 ||
385 strcmp(buf, "template") == 0 ||
386 strcmp(buf, "typename") == 0)
// Step over the line terminator(s) to reach the next line.
395 while (*pe == '\r' || *pe == '\n') pe++;
399 // Nothing to suggest C++.
// Detects the language of a file whose path ends in a 3-character suffix
// (".in") by building a temporary SourceFile for the path without that
// suffix, lending it this file's contents and filename list, and asking for
// its language.
// NOTE(review): the return statement and some setup lines are not visible in
// this view.
403 const char *disambiguate_in(SourceFile *sourcefile) {
406 const char *language = NULL;
408 p = sourcefile->filepath;
// pe points at the start of the trailing 3 characters of the path.
409 pe = p + strlen(p) - 3;
// NOTE(review): strstr finds the first '.' anywhere in filepath (directory
// components included), and `<=` is also true when that first '.' is the one
// at pe itself, i.e. when there is no earlier extension -- confirm against
// the intent stated in the comment below.
410 if (strstr(p, ".") <= pe) {
411 // Only if the filename has an extension prior to the .in
414 strncpy(buf, p, length);
416 SourceFile *undecorated = ohcount_sourcefile_new(buf);
417 p = ohcount_sourcefile_get_contents(sourcefile);
418 // The filepath without the '.in' extension does not exist on disk. The
419 // sourcefile->diskpath field must be set in case the detector needs to run
420 // 'file -b' on the file.
421 ohcount_sourcefile_set_diskpath(undecorated, sourcefile->filepath);
422 ohcount_sourcefile_set_contents(undecorated, p);
423 char **filenames = ohcount_sourcefile_get_filenames(sourcefile);
424 ohcount_sourcefile_set_filenames(undecorated, filenames);
425 language = ohcount_sourcefile_get_language(undecorated);
// The temporary SourceFile is released once its language has been read.
426 ohcount_sourcefile_free(undecorated);
// Scans the file contents; the one visible test looks for a '?' immediately
// followed by "php".
// NOTE(review): the surrounding loop and every return value are on lines not
// visible in this view.
431 const char *disambiguate_inc(SourceFile *sourcefile) {
432 char *p = ohcount_sourcefile_get_contents(sourcefile);
// NOTE(review): eof is derived with strlen() here, whereas the sibling
// disambiguators use ohcount_sourcefile_get_contents_size() -- confirm the
// difference is intentional.
433 char *eof = p + strlen(p);
437 else if (*p == '?' && strncmp(p + 1, "php", 3) == 0)
// Decides what a ".m"-style file is by scoring evidence for several
// candidates and comparing the totals at the end:
//   - limbo_score highest        -> (result on a line not visible here)
//   - objective_c_score > matlab -> LANG_OBJECTIVE_C
//   - otherwise                  -> LANG_OCTAVE if Octave-only syntax was
//                                   seen, else LANG_MATLAB
// Evidence comes from sibling filenames (.h headers without C/C++ sources)
// and from a line-by-line scan of the contents.
// NOTE(review): many statements (including most score increments) are on
// lines not visible in this view.
444 const char *disambiguate_m(SourceFile *sourcefile) {
448 // Attempt to detect based on a weighted heuristic of file contents.
449 int matlab_score = 0;
450 int objective_c_score = 0;
452 int octave_syntax_detected = 0;
454 int i, has_h_headers = 0, has_c_files = 0;
455 char **filenames = ohcount_sourcefile_get_filenames(sourcefile);
457 for (i = 0; filenames[i] != NULL; i++) {
// Four-character suffixes: .cpp, .c++, .cxx
461 if (*(pe - 4) == '.' && *(pe - 3) == 'c' &&
462 ((*(pe - 2) == 'p' && *(pe - 1) == 'p') ||
463 (*(pe - 2) == '+' && *(pe - 1) == '+') ||
464 (*(pe - 2) == 'x' && *(pe - 1) == 'x'))) {
466 break; // short circuit
468 } else if (pe - p >= 3) {
// Three-character suffix: .cc
469 if (*(pe - 3) == '.' && *(pe - 2) == 'c' && *(pe - 1) == 'c') {
471 break; // short circuit
473 } else if (pe - p >= 2) {
// Two-character suffixes: .h, .c, .C
474 if (*(pe - 2) == '.') {
475 if (*(pe - 1) == 'h')
477 else if (*(pe - 1) == 'c' || *(pe - 1) == 'C') {
479 break; // short circuit
// Headers with no C/C++ sources alongside weigh towards Objective C.
485 if (has_h_headers && !has_c_files)
486 objective_c_score += 5;
488 char line[81], buf[81];
489 p = ohcount_sourcefile_get_contents(sourcefile);
491 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
493 // Get a line at a time.
494 while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
// NOTE(review): length can equal sizeof(line); strncpy then leaves no room
// for a terminator. Capping at sizeof(line) - 1 looks safer -- confirm.
495 length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
496 strncpy(line, p, length);
500 // Look for tell-tale lines.
502 while (*p == ' ' || *p == '\t') p++;
503 if (*p == '%') { // Matlab comment
505 } else if (*p == '#') { // Limbo or Octave comment
506 while (*p == '#') p++;
507 if (*p == ' ' || *p == '\t') {
510 octave_syntax_detected = 1;
// NOTE(review): && binds tighter than ||, so this condition is
// (*p == '/' && *(p + 1) == '/') || *(p + 1) == '*' -- any line whose second
// character is '*' matches. The intent looks like
// *p == '/' && (*(p + 1) == '/' || *(p + 1) == '*').
512 } else if (*p == '/' && *(p + 1) == '/' || *(p + 1) == '*') {
513 objective_c_score++; // Objective C comment
514 } else if (*p == '+' || *p == '-') { // Objective C method signature
516 } else if (*p == '@') { // Objective C method signature
517 if (strncmp(p, "@implementation", 15) == 0 ||
518 strncmp(p, "@interface", 10) == 0)
520 } else if (strncmp(p, "function", 8) == 0) { // Matlab or Octave function
522 while (*p == ' ' || *p == '\t') p++;
525 } else if (strncmp(p, "include", 7) == 0) { // Limbo include
526 // /^include[ \t]+"[^"]+\.m";/
528 if (*p == ' ' || *p == '\t') {
529 while (*p == ' ' || *p == '\t') p++;
531 while (*p != '"' && p < line + strlen(line)) p++;
// The quoted name must end in ".m".
532 if (*p == '"' && *(p - 2) == '.' && *(p - 1) == 'm')
538 // Look for Octave keywords.
540 while (p < line + strlen(line)) {
// NOTE(review): if p starts at line (its initialization is not visible here),
// *(p - 1) reads one byte before the buffer on the first iteration.
541 if (islower(*p) && !isalnum(*(p - 1))) {
543 while (islower(*pe) || *pe == '_') pe++;
546 strncpy(buf, p, length);
548 if (strcmp(buf, "end_try_catch") == 0 ||
549 strcmp(buf, "end_unwind_protect") == 0 ||
550 strcmp(buf, "endfunction") == 0 ||
551 strcmp(buf, "endwhile") == 0)
552 octave_syntax_detected = 1;
558 // Look for Limbo declarations
560 while (p < line + strlen(line)) {
561 if (*p == ':' && (*(p + 1) == ' ' || *(p + 1) == '\t')) {
562 // /:[ \t]+(module|adt|fn ?\(|con[ \t])/
// Each alternative below is a `strncmp(...) == 0 && <follow-char test>` pair
// joined by ||; this relies on && binding tighter than ||.
564 if (strncmp(p, "module", 6) == 0 && !isalnum(*(p + 6)) ||
565 strncmp(p, "adt", 3) == 0 && !isalnum(*(p + 3)) ||
566 strncmp(p, "fn", 2) == 0 &&
567 (*(p + 2) == ' ' && *(p + 3) == '(' || *(p + 2) == '(') ||
568 strncmp(p, "con", 3) == 0 &&
569 (*(p + 3) == ' ' || *(p + 3) == '\t'))
// Step over the line terminator(s) to reach the next line.
576 while (*pe == '\r' || *pe == '\n') pe++;
// Final decision: strictly highest score wins; ties fall through.
580 if (limbo_score > objective_c_score && limbo_score > matlab_score)
582 else if (objective_c_score > matlab_score)
583 return LANG_OBJECTIVE_C;
585 return octave_syntax_detected ? LANG_OCTAVE : LANG_MATLAB;
// Scans the contents line by line and returns LANG_SMALLTALK as soon as an
// assignment, a block start and a block end ("]" followed by ".") have all
// been seen.
// NOTE(review): the tests that set found_assignment / found_block_start and
// the fallback return are on lines not visible in this view.
588 const char *disambiguate_st(SourceFile *sourcefile) {
592 // Attempt to detect based on file contents.
593 int found_assignment = 0, found_block_start = 0, found_block_end = 0;
596 p = ohcount_sourcefile_get_contents(sourcefile);
598 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
600 // Get a line at a time.
// NOTE(review): this loop advances pe but bounds p, so it is not stopped by
// eof; the equivalent loops in the sibling functions test `pe < eof`. This
// looks like a typo for `pe < eof`.
601 while (p < eof && *pe != '\r' && *pe != '\n') pe++;
// NOTE(review): length can equal sizeof(line); strncpy then leaves no room
// for a terminator. Capping at sizeof(line) - 1 looks safer -- confirm.
602 length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
603 strncpy(line, p, length);
607 for (p = line; p < line + strlen(line); p++) {
610 while (p < line + strlen(line) && (*p == ' ' || *p == '\t')) p++;
612 found_assignment = 1;
614 found_block_start = 1;
615 } else if (*p == ']' && *(p + 1) == '.') found_block_end = 1;
// All three markers together are taken as sufficient evidence.
616 if (found_assignment && found_block_start && found_block_end)
617 return LANG_SMALLTALK;
// Step over the line terminator(s) to reach the next line.
622 while (*pe == '\r' || *pe == '\n') pe++;