1 // detector.c written by Mitchell Foral. mitchell<att>caladbolg.net.
2 // See COPYING for license information.
11 #include "languages.h"
14 #include "hash/cppheader_hash.h"
15 #include "hash/disambiguatefunc_hash.h"
16 #include "hash/extension_hash.h"
17 #include "hash/filename_hash.h"
// Table values flag special cases through their first character:
// '\1' is tested by ISBINARY and '\2' by ISAMBIGUOUS.
// NOTE(review): the macro argument is not parenthesized, so these are only
// safe when `x` is a simple expression such as an identifier or `re->value`.
19 #define ISBINARY(x) (x[0] == '\1')
20 #define ISAMBIGUOUS(x) (x[0] == '\2')
// Address of the text that follows the one-character marker; it is used as
// the key when looking up a disambiguation function.
21 #define DISAMBIGUATEWHAT(x) &x[1]
// Detects the language of `sourcefile`. The section comments below name the
// strategies used: an Emacs mode line, the file extension, the filename, and
// the output of the Unix 'file' command. A result carrying the ambiguous
// marker is handed to a disambiguation function looked up by id; a result
// carrying the binary marker is turned into NULL.
23 const char *ohcount_detect_language(SourceFile *sourcefile) {
24 const char *language = NULL;
28 // Attempt to detect using Emacs mode line (/^-\*-\s*mode[\s:]*\w/i).
29 char line[81] = { '\0' }, buf[81];
30 p = ohcount_sourcefile_get_contents(sourcefile);
32 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
34 // Get the contents of the first line.
35 while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
// Clamp to sizeof(line) - 1 so that a long first line never fills the whole
// buffer: one byte must stay available for the terminating NUL.
36 length = (pe - p < sizeof(line)) ? pe - p : sizeof(line) - 1;
37 strncpy(line, p, length);
39 if (*line == '#' && *(line + 1) == '!') {
40 // First line was sh-bang; loop to get contents of second line.
41 while (*pe == '\r' || *pe == '\n') pe++;
45 p = strstr(line, "-*-");
48 while (*p == ' ' || *p == '\t') p++;
49 // detect "mode" (any capitalization)
50 if (strncasecmp(p, "mode", 4) == 0) {
52 while (*p == ' ' || *p == '\t' || *p == ':') p++;
55 while (!isspace(*pe) && *pe != ';' && pe != strstr(pe, "-*-")) pe++;
57 strncpy(buf, p, length);
59 // First try it with the language name.
60 struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
61 if (rl) language = rl->name;
63 // Then try it with the extension table.
64 struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
65 if (re) language = re->value;
68 // Try the lower-case version of this modeline.
69 for (pe = buf; pe < buf+length; pe++) *pe = tolower(*pe);
70 // First try it with the language name.
71 rl = ohcount_hash_language_from_name(buf, length);
72 if (rl) language = rl->name;
75 // Then try it with the extension table.
76 struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
77 if (re) language = re->value;
81 // Attempt to detect based on file extension.
83 length = strlen(sourcefile->ext);
84 struct ExtensionMap *re = ohcount_hash_language_from_ext(sourcefile->ext,
86 if (re) language = re->value;
88 // Try the lower-case version of this extension.
89 char lowerext[length + 1];
90 strncpy(lowerext, sourcefile->ext, length);
91 lowerext[length] = '\0';
92 for (p = lowerext; p < lowerext + length; p++) *p = tolower(*p);
93 struct ExtensionMap *re = ohcount_hash_language_from_ext(lowerext, length);
94 if (re) language = re->value;
98 // Attempt to detect based on filename.
100 length = strlen(sourcefile->filename);
101 struct FilenameMap *rf =
102 ohcount_hash_language_from_filename(sourcefile->filename, length);
103 if (rf) language = rf->value;
106 // Attempt to detect based on Unix 'file' command.
// Prefer the on-disk path when one has been set on the source file.
109 char *path = sourcefile->filepath;
110 if (sourcefile->diskpath)
111 path = sourcefile->diskpath;
112 if (access(path, F_OK) != 0) { // create temporary file
114 strncpy(path, "/tmp/ohcount_XXXXXXX", 20);
116 int fd = mkstemp(path);
117 char *contents = ohcount_sourcefile_get_contents(sourcefile);
120 length = contents ? strlen(contents) : 0;
121 write(fd, contents, length);
// 11 = the 10 characters of "file -b ''" plus the terminating NUL.
// NOTE(review): the path is interpolated between single quotes without
// escaping; a path that itself contains a single quote would break out of
// the quoting in the shell command.
125 char command[strlen(path) + 11];
126 sprintf(command, "file -b '%s'", path);
127 FILE *f = popen(command, "r");
129 fgets(line, sizeof(line), f);
130 char *eol = line + strlen(line);
// Lower-case the output so the substring checks below are case-insensitive.
131 for (p = line; p < eol; p++) *p = tolower(*p);
132 p = strstr(line, "script text");
133 if (p && p == line) { // /^script text(?: executable)? for \w/
134 p = strstr(line, "for ");
138 while (isalnum(*pe)) pe++;
140 strncpy(buf, p, length);
142 struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
143 if (rl) language = rl->name;
145 } else if (p) { // /(\w+)(?: -\w+)* script text/
149 while (*p == ' ') p--;
150 while (p != line && isalnum(*(p - 1))) p--;
151 if (p != line && *(p - 1) == '-') p--;
152 } while (*p == '-'); // Skip over any switches.
154 strncpy(buf, p, length);
156 struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
157 if (rl) language = rl->name;
158 } else if (strstr(line, "xml")) language = LANG_XML;
167 if (ISAMBIGUOUS(language)) {
168 // Call the appropriate function for disambiguation.
169 length = strlen(DISAMBIGUATEWHAT(language));
170 struct DisambiguateFuncsMap *rd =
171 ohcount_hash_disambiguate_func_from_id(DISAMBIGUATEWHAT(language),
173 if (rd) language = rd->value(sourcefile);
174 } else language = ISBINARY(language) ? NULL : language;
// Disambiguation routine that scans the contents for `<%@ ... %>` directives,
// lower-cases the directive text and checks for a `page` directive that
// contains `language="vb"`.
179 const char *disambiguate_aspx(SourceFile *sourcefile) {
180 char *p = ohcount_sourcefile_get_contents(sourcefile);
181 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
182 for (; p < eof; p++) {
183 // /<%@\s*Page[^>]+Language="VB"[^>]+%>/
184 p = strstr(p, "<%@");
187 char *pe = strstr(p, "%>");
190 const int length = pe - p;
192 strncpy(buf, p, length);
194 char *eol = buf + strlen(buf);
// NOTE(review): `p` is repointed into `buf` here while the enclosing loop
// compares `p` against `eof` of the contents buffer - confirm `p` is moved
// back into the contents before the next iteration.
195 for (p = buf; p < eol; p++) *p = tolower(*p);
197 while (*p == ' ' || *p == '\t') p++;
198 if (strncmp(p, "page", 4) == 0) {
200 if (strstr(p, "language=\"vb\""))
// Disambiguation routine that tests the contents against the keyword
// patterns summarized by the regular expression below (implement, include,
// return/break/continue, pick/case). The last statement defers to
// disambiguate_basic().
208 const char *disambiguate_b(SourceFile *sourcefile) {
209 char *p = ohcount_sourcefile_get_contents(sourcefile);
210 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
212 // /(implement[ \t])|(include[ \t]+"[^"]*";)|
213 // ((return|break|continue).*;|(pick|case).*\{)/
// "implement" followed by a blank.
214 if (strncmp(p, "implement", 9) == 0 &&
215 (*(p + 9) == ' ' || *(p + 9) == '\t'))
// "include" followed by a blank, then a quoted name ending in `";`.
217 else if (strncmp(p, "include", 7) == 0 &&
218 (*(p + 7) == ' ' || *(p + 7) == '\t')) {
220 while (*p == ' ' || *p == '\t') p++;
// NOTE(review): `*p` is read before the `p < eof` bound is checked.
222 while (*p != '"' && p < eof) p++;
223 if (*p == '"' && *(p + 1) == ';')
226 } else if (strncmp(p, "return", 6) == 0 ||
227 strncmp(p, "break", 5) == 0 ||
228 strncmp(p, "continue", 8) == 0) {
231 } else if (strncmp(p, "pick", 4) == 0 ||
232 strncmp(p, "case", 4) == 0) {
238 return disambiguate_basic(sourcefile);
// Chooses between the BASIC dialects. Contents are examined line by line
// (leading digits followed by blanks lead towards LANG_CLASSIC_BASIC); then
// the sibling filenames are checked for Visual Basic extensions
// (frm, frx, vba, vbp, vbs); the final fallback is LANG_STRUCTURED_BASIC.
241 const char *disambiguate_basic(SourceFile *sourcefile) {
245 // Attempt to detect based on file contents.
247 p = ohcount_sourcefile_get_contents(sourcefile);
249 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
251 // Get a line at a time.
252 while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
// Clamp to sizeof(line) - 1 so that an over-long line never fills the whole
// buffer: one byte must stay available for the terminating NUL.
253 length = (pe - p < sizeof(line)) ? pe - p : sizeof(line) - 1;
254 strncpy(line, p, length);
// Skip the leading digits, then require at least one blank after them.
262 while (isdigit(*p)) p++;
263 if (*p == ' ' || *p == '\t') {
265 while (*p == ' ' || *p == '\t') p++;
267 return LANG_CLASSIC_BASIC;
// Step over the line terminator(s) to reach the next line.
273 while (*pe == '\r' || *pe == '\n') pe++;
277 // Attempt to detect from associated VB files in file context.
278 char **filenames = sourcefile->filenames;
281 for (i = 0; filenames[i] != NULL; i++) {
282 pe = filenames[i] + strlen(filenames[i]);
// Walk back to the character after the last '.', i.e. the extension.
284 while (p > filenames[i] && *(p - 1) != '.') p--;
287 (strncmp(p, "frm", length) == 0 ||
288 strncmp(p, "frx", length) == 0 ||
289 strncmp(p, "vba", length) == 0 ||
290 strncmp(p, "vbp", length) == 0 ||
291 strncmp(p, "vbs", length) == 0)) {
292 return LANG_VISUALBASIC;
297 return LANG_STRUCTURED_BASIC;
// Disambiguation routine keyed on the contents: the presence of the "<?cs"
// marker selects LANG_CLEARSILVER_TEMPLATE.
300 const char *disambiguate_cs(SourceFile *sourcefile) {
301 // Attempt to detect based on file contents.
302 char *contents = ohcount_sourcefile_get_contents(sourcefile);
// A NULL contents pointer is tolerated and simply does not match.
303 if (contents && strstr(contents, "<?cs"))
304 return LANG_CLEARSILVER_TEMPLATE;
// Chooses between LANG_FORTRANFIXED and LANG_FORTRANFREE by inspecting the
// first columns of a line and by looking for '&' continuation markers.
// LANG_FORTRANFREE is the final fallback.
309 const char *disambiguate_fortran(SourceFile *sourcefile) {
312 p = ohcount_sourcefile_get_contents(sourcefile);
313 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
// Only inspect the five characters after a leading space when they are
// inside the buffer.
315 if (*p == ' ' && p + 5 < eof) {
317 for (i = 1; i <= 5; i++)
318 if (!isdigit(*(p + i)) && *(p + i) != ' ')
// NOTE(review): the comment says "not f77" yet the fixed-form constant is
// returned - confirm which of the two is intended.
319 return LANG_FORTRANFIXED; // definitely not f77
320 // Possibly fixed (doesn't match /^\s*\d+\s*$/).
322 while (*pe == ' ' || *pe == '\t') pe++;
325 return LANG_FORTRANFIXED;
326 while (isdigit(*pe)) pe++;
327 while (*pe == ' ' || *pe == '\t') pe++;
328 if (*pe != '\r' && *pe != '\n' && pe - p == 5)
329 return LANG_FORTRANFIXED;
// Advance to the end of the line or to the first '&'.
// NOTE(review): `*p` is read before the `p < eof` bound is checked.
332 while (*p != '\r' && *p != '\n' && *p != '&' && p < eof) p++;
335 // Look for free-form continuation.
336 while (*p == ' ' || *p == '\t') p++;
337 if (*p == '\r' || *p == '\n') {
339 while (*pe == '\r' || *pe == '\n' || *pe == ' ' || *pe == '\t') pe++;
341 return LANG_FORTRANFREE;
344 while (*p == '\r' || *p == '\n') p++;
346 return LANG_FORTRANFREE; // might as well be free-form
// Disambiguates a header file. A sibling file with the same name but an 'm'
// in place of the final character selects LANG_OBJECTIVE_C; otherwise the
// contents are scanned line by line for C++ header includes and for the C++
// keywords class, namespace, template and typename.
349 const char *disambiguate_h(SourceFile *sourcefile) {
353 // If the directory contains a matching *.m file, likely Objective C.
354 length = strlen(sourcefile->filename);
355 if (strcmp(sourcefile->ext, "h") == 0) {
357 strncpy(path, sourcefile->filename, length);
// Replace the trailing 'h' of the copied filename with 'm'.
359 *(path + length - 1) = 'm';
360 char **filenames = sourcefile->filenames;
363 for (i = 0; filenames[i] != NULL; i++)
364 if (strcmp(path, filenames[i]) == 0)
365 return LANG_OBJECTIVE_C;
369 // Attempt to detect based on file contents.
370 char line[81], buf[81];
371 bof = ohcount_sourcefile_get_contents(sourcefile);
374 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
376 // Get a line at a time.
377 while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
// Clamp to sizeof(line) - 1 so that an over-long line never fills the whole
// buffer: one byte must stay available for the terminating NUL.
378 length = (pe - p < sizeof(line)) ? pe - p : sizeof(line) - 1;
379 strncpy(line, p, length);
381 char *eol = line + strlen(line);
384 // Look for C++ headers.
387 while (*p == ' ' || *p == '\t') p++;
388 if (strncmp(p, "include", 7) == 0 &&
389 (*(p + 7) == ' ' || *(p + 7) == '\t')) {
390 // /^#\s*include\s+[<"][^>"]+[>"]/
392 while (*p == ' ' || *p == '\t') p++;
393 if (*p == '<' || *p == '"') {
394 // Is the header file a C++ header file?
397 while (pe < eol && *pe != '>' && *pe != '"') pe++;
399 strncpy(buf, p, length);
401 if (ohcount_hash_is_cppheader(buf, length))
403 // Is the extension for the header file a C++ file?
405 while (p > line && *(p - 1) != '.') p--;
407 strncpy(buf, p, length);
409 struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
410 if (re && strcmp(re->value, LANG_CPP) == 0)
416 // Look for C++ keywords.
// NOTE(review): `bof` is the start of the contents buffer; if `p` walks the
// `line` copy here, `p != bof` does not stop `*(p - 1)` from reading before
// `line` - confirm whether `p != line` was intended.
419 if (islower(*p) && p != bof && !isalnum(*(p - 1)) && *(p - 1) != '_') {
421 while (islower(*pe)) pe++;
// Only accept the run of lower-case letters when it is a whole word.
422 if (!isalnum(*pe) && *pe != '_') {
424 strncpy(buf, p, length);
426 if (strcmp(buf, "class") == 0 ||
427 strcmp(buf, "namespace") == 0 ||
428 strcmp(buf, "template") == 0 ||
429 strcmp(buf, "typename") == 0)
// Step over the line terminator(s) to reach the next line.
438 while (*pe == '\r' || *pe == '\n') pe++;
442 // Nothing to suggest C++.
// Disambiguates a file whose path ends in a three-character suffix (".in"
// per the comments below) by creating a temporary SourceFile for the
// undecorated path, giving it the original's contents, disk path and
// filenames, and asking for that SourceFile's language.
446 const char *disambiguate_in(SourceFile *sourcefile) {
449 const char *language = NULL;
451 p = sourcefile->filepath;
// `pe` points at the last three characters of the path.
452 pe = p + strlen(p) - 3;
453 if (strstr(p, ".") <= pe) {
454 // Only if the filename has an extension prior to the .in
457 strncpy(buf, p, length);
459 SourceFile *undecorated = ohcount_sourcefile_new(buf);
460 p = ohcount_sourcefile_get_contents(sourcefile);
464 // The filepath without the '.in' extension does not exist on disk. The
465 // sourcefile->diskpath field must be set in case the detector needs to run
466 // 'file -b' on the file.
467 ohcount_sourcefile_set_diskpath(undecorated, sourcefile->filepath);
468 ohcount_sourcefile_set_contents(undecorated, p);
469 undecorated->filenames = sourcefile->filenames;
470 language = ohcount_sourcefile_get_language(undecorated);
// The temporary SourceFile is released once its language has been read.
471 ohcount_sourcefile_free(undecorated);
// Disambiguation routine that inspects the contents; one of its checks looks
// for a '?' immediately followed by "php".
476 const char *disambiguate_inc(SourceFile *sourcefile) {
477 char *p = ohcount_sourcefile_get_contents(sourcefile);
// The end of the contents is derived with strlen() here, so the contents are
// treated as a NUL-terminated string.
479 char *eof = p + strlen(p);
483 else if (*p == '?' && strncmp(p + 1, "php", 3) == 0)
// Disambiguates a ".m"-style source by scoring evidence for Matlab,
// Objective C and Limbo. Sibling filenames contribute to the Objective C
// score (headers present but no C/C++ sources); the contents are then
// examined line by line for comments, directives and keywords. When Limbo
// does not win, LANG_OBJECTIVE_C is returned if it outscores Matlab;
// otherwise LANG_OCTAVE or LANG_MATLAB depending on whether Octave-only
// syntax was seen.
491 const char *disambiguate_m(SourceFile *sourcefile) {
495 // Attempt to detect based on a weighted heuristic of file contents.
496 int matlab_score = 0;
497 int objective_c_score = 0;
499 int octave_syntax_detected = 0;
501 int i, has_h_headers = 0, has_c_files = 0;
502 char **filenames = sourcefile->filenames;
504 for (i = 0; filenames[i] != NULL; i++) {
// Four-character suffixes: .cpp, .c++, .cxx
508 if (*(pe - 4) == '.' && *(pe - 3) == 'c' &&
509 ((*(pe - 2) == 'p' && *(pe - 1) == 'p') ||
510 (*(pe - 2) == '+' && *(pe - 1) == '+') ||
511 (*(pe - 2) == 'x' && *(pe - 1) == 'x'))) {
513 break; // short circuit
// Three-character suffix: .cc
515 } else if (pe - p >= 3) {
516 if (*(pe - 3) == '.' && *(pe - 2) == 'c' && *(pe - 1) == 'c') {
518 break; // short circuit
// Two-character suffixes: .h, .c, .C
520 } else if (pe - p >= 2) {
521 if (*(pe - 2) == '.') {
522 if (*(pe - 1) == 'h')
524 else if (*(pe - 1) == 'c' || *(pe - 1) == 'C') {
526 break; // short circuit
// Headers without any C/C++ sources next to them suggest Objective C.
532 if (has_h_headers && !has_c_files)
533 objective_c_score += 5;
535 char line[81], buf[81];
536 p = ohcount_sourcefile_get_contents(sourcefile);
538 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
540 // Get a line at a time.
541 while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
// Clamp to sizeof(line) - 1 so that an over-long line never fills the whole
// buffer: one byte must stay available for the terminating NUL.
542 length = (pe - p < sizeof(line)) ? pe - p : sizeof(line) - 1;
543 strncpy(line, p, length);
545 char *eol = line + strlen(line);
548 // Look for tell-tale lines.
550 while (*p == ' ' || *p == '\t') p++;
551 if (*p == '%') { // Matlab comment
553 } else if (*p == '#' && strncmp(p, "#import", 7) == 0) { // Objective C
555 } else if (*p == '#') { // Limbo or Octave comment
556 while (*p == '#') p++;
557 if (*p == ' ' || *p == '\t') {
560 octave_syntax_detected = 1;
// The '/' test must apply to both comment openers ("//" and "/*"). Without
// the parentheses, && binds tighter than || and any line whose second
// character is '*' (e.g. "a*b") was counted as an Objective C comment.
562 } else if (*p == '/' && (*(p + 1) == '/' || *(p + 1) == '*')) {
563 objective_c_score++; // Objective C comment
564 } else if (*p == '+' || *p == '-') { // Objective C method signature
566 } else if (*p == '@' || *p == '#') { // Objective C method signature
567 if (strncmp(p, "@implementation", 15) == 0 ||
568 strncmp(p, "@interface", 10) == 0)
570 } else if (strncmp(p, "function", 8) == 0) { // Matlab or Octave function
572 while (*p == ' ' || *p == '\t') p++;
575 } else if (strncmp(p, "include", 7) == 0) { // Limbo include
576 // /^include[ \t]+"[^"]+\.m";/
578 if (*p == ' ' || *p == '\t') {
579 while (*p == ' ' || *p == '\t') p++;
581 while (*p != '"' && p < eol) p++;
582 if (*p == '"' && *(p - 2) == '.' && *(p - 1) == 'm')
588 // Look for Octave keywords.
// A keyword candidate starts at a lower-case letter that is not the first
// character of the line and is not preceded by an alphanumeric.
591 if (islower(*p) && p != line && !isalnum(*(p - 1))) {
593 while (islower(*pe) || *pe == '_') pe++;
596 strncpy(buf, p, length);
598 if (strcmp(buf, "end_try_catch") == 0 ||
599 strcmp(buf, "end_unwind_protect") == 0 ||
600 strcmp(buf, "endfunction") == 0 ||
601 strcmp(buf, "endwhile") == 0)
602 octave_syntax_detected = 1;
608 // Look for Limbo declarations
611 if (*p == ':' && (*(p + 1) == ' ' || *(p + 1) == '\t')) {
612 // /:[ \t]+(module|adt|fn ?\(|con[ \t])/
614 if (strncmp(p, "module", 6) == 0 && !isalnum(*(p + 6)) ||
615 strncmp(p, "adt", 3) == 0 && !isalnum(*(p + 3)) ||
616 strncmp(p, "fn", 2) == 0 &&
617 (*(p + 2) == ' ' && *(p + 3) == '(' || *(p + 2) == '(') ||
618 strncmp(p, "con", 3) == 0 &&
619 (*(p + 3) == ' ' || *(p + 3) == '\t'))
// Step over the line terminator(s) to reach the next line.
626 while (*pe == '\r' || *pe == '\n') pe++;
// Limbo wins only when it strictly outscores both other candidates.
630 if (limbo_score > objective_c_score && limbo_score > matlab_score)
632 else if (objective_c_score > matlab_score)
633 return LANG_OBJECTIVE_C;
635 return octave_syntax_detected ? LANG_OCTAVE : LANG_MATLAB;
// Assignment prefixes that disambiguate_pro() searches for; each one comes in
// a variant with and without a space before "+=".
638 #define QMAKE_SOURCES_SPACE "SOURCES +="
639 #define QMAKE_SOURCES "SOURCES+="
640 #define QMAKE_CONFIG_SPACE "CONFIG +="
641 #define QMAKE_CONFIG "CONFIG+="
// Disambiguation routine that compares every position of the contents with
// the QMAKE_* prefixes. A match yields LANG_MAKE; LANG_IDL_PVWAVE is the
// fallback.
643 const char *disambiguate_pro(SourceFile *sourcefile) {
644 char *p = ohcount_sourcefile_get_contents(sourcefile);
645 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
646 for (; p < eof; p++) {
647 if (strncmp(p, QMAKE_SOURCES_SPACE, strlen(QMAKE_SOURCES_SPACE)) == 0 ||
648 strncmp(p, QMAKE_SOURCES, strlen(QMAKE_SOURCES)) == 0 ||
649 strncmp(p, QMAKE_CONFIG_SPACE, strlen(QMAKE_CONFIG_SPACE)) == 0 ||
650 strncmp(p, QMAKE_CONFIG, strlen(QMAKE_CONFIG)) == 0)
651 return LANG_MAKE; // really QMAKE
653 return LANG_IDL_PVWAVE;
// Disambiguation routine that scans the contents line by line and returns
// LANG_SMALLTALK once an assignment, a block start and a block end ("]."),
// have all been seen.
656 const char *disambiguate_st(SourceFile *sourcefile) {
660 // Attempt to detect based on file contents.
661 int found_assignment = 0, found_block_start = 0, found_block_end = 0;
664 p = ohcount_sourcefile_get_contents(sourcefile);
666 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
668 // Get a line at a time.
// The scan pointer `pe` (not the line start `p`) must be bounded by `eof`,
// otherwise a final line without a terminator is scanned past the buffer.
669 while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
// Clamp to sizeof(line) - 1 so that an over-long line never fills the whole
// buffer: one byte must stay available for the terminating NUL.
670 length = (pe - p < sizeof(line)) ? pe - p : sizeof(line) - 1;
671 strncpy(line, p, length);
673 char *eol = line + strlen(line);
676 for (p = line; p < eol; p++) {
679 while (p < eol && (*p == ' ' || *p == '\t')) p++;
681 found_assignment = 1;
683 found_block_start = 1;
684 } else if (*p == ']' && *(p + 1) == '.') found_block_end = 1;
// All three indicators together are required for a positive result.
685 if (found_assignment && found_block_start && found_block_end)
686 return LANG_SMALLTALK;
// Step over the line terminator(s) to reach the next line.
691 while (*pe == '\r' || *pe == '\n') pe++;
// Reports whether `filename` has an extension that the extension table marks
// as binary. The extension is looked up as written and, failing that, in
// lower case; a hit returns the ISBINARY test of the table value.
698 int ohcount_is_binary_filename(const char *filename) {
// Walk back from the end of the name to the character after the last '.'.
699 char *p = (char *)filename + strlen(filename);
700 while (p > filename && *(p - 1) != '.') p--;
702 struct ExtensionMap *re;
703 int length = strlen(p);
704 re = ohcount_hash_language_from_ext(p, length);
705 if (re) return ISBINARY(re->value);
706 // Try the lower-case version of this extension.
// One extra byte is needed: the terminator below is stored at index `length`.
707 char lowerext[length + 1];
708 strncpy(lowerext, p, length);
709 lowerext[length] = '\0';
710 for (p = lowerext; p < lowerext + length; p++) *p = tolower(*p);
711 re = ohcount_hash_language_from_ext(lowerext, length);
712 if (re) return ISBINARY(re->value);