1 // detector.c written by Mitchell Foral. mitchell<att>caladbolg.net.
2 // See COPYING for license information.
10 #include "languages.h"
12 #include "hash/cppheader_hash.h"
13 #include "hash/disambiguatefunc_hash.h"
14 #include "hash/extension_hash.h"
15 #include "hash/filename_hash.h"
// Language values looked up by extension are tagged by their first byte:
// '\1' is tested by ISBINARY and '\2' by ISAMBIGUOUS. DISAMBIGUATEWHAT yields
// the text after the tag byte, which the detector uses as the key for looking
// up a disambiguation function.
// NOTE(review): the macro argument is not parenthesized in any expansion, so
// these are only safe when invoked with a simple identifier.
17 #define ISBINARY(x) (x[0] == '\1')
18 #define ISAMBIGUOUS(x) (x[0] == '\2')
19 #define DISAMBIGUATEWHAT(x) &x[1]
// Detects the language of a source file. The visible steps are, in order:
// the file extension (as given, then lower-cased), the filename, an Emacs
// "-*- mode -*-" line in the contents, and finally the output of the Unix
// `file -b` command. Returns a language name, or NULL when the extension maps
// to a value tagged as binary.
// NOTE(review): the embedded line numbers have gaps, so some statements of
// this function are not shown; the notes below cover only the visible lines.
21 const char *ohcount_detect_language(SourceFile *sourcefile) {
22 const char *language = NULL;
26 // Attempt to detect based on file extension.
27 length = strlen(sourcefile->ext);
28 struct ExtensionMap *re = ohcount_hash_language_from_ext(sourcefile->ext,
30 if (re) language = re->value;
31 if (language == NULL) {
32 // Try the lower-case version of this extension.
// NOTE(review): lowerext has `length` elements, but the terminator below is
// stored at index `length`, one past the end; it needs length + 1 elements.
33 char lowerext[length];
34 strncpy(lowerext, sourcefile->ext, length);
35 lowerext[length] = '\0';
// NOTE(review): tolower() on a plain char is undefined for negative values;
// the argument should be cast to unsigned char.
36 for (p = lowerext; p < lowerext + length; p++) *p = tolower(*p);
37 struct ExtensionMap *re = ohcount_hash_language_from_ext(lowerext, length);
// NOTE(review): this hit is returned as-is, without the ISAMBIGUOUS/ISBINARY
// handling that is applied to `language` below.
38 if (re) return re->value;
41 if (ISAMBIGUOUS(language)) {
42 // Call the appropriate function for disambiguation.
43 length = strlen(DISAMBIGUATEWHAT(language));
44 struct DisambiguateFuncsMap *rd =
45 ohcount_hash_disambiguate_func_from_id(DISAMBIGUATEWHAT(language),
47 if (rd) return rd->value(sourcefile);
48 } else return ISBINARY(language) ? NULL : language;
51 // Attempt to detect based on filename.
52 length = strlen(sourcefile->filename);
53 struct FilenameMap *rf =
54 ohcount_hash_language_from_filename(sourcefile->filename, length);
55 if (rf) return rf->value;
// Scratch buffers: `line` holds one line of text, `buf` an extracted word.
57 char line[81], buf[81];
59 // Attempt to detect using Emacs mode line (/^-\*-\s*mode[\s:]*\w/i).
60 p = ohcount_sourcefile_get_contents(sourcefile);
62 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
64 // Get the contents of the first line.
65 while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
// NOTE(review): length may reach sizeof(line), which leaves no room for a NUL
// inside `line`; a terminator stored at line[length] (not shown here) would
// then be out of bounds. Consider clamping to sizeof(line) - 1.
66 length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
67 strncpy(line, p, length);
69 if (*line == '#' && *(line + 1) == '!') {
70 // First line was sh-bang; loop to get contents of second line.
// NOTE(review): this skip is not bounded by eof; presumably it relies on the
// contents being NUL-terminated (confirm).
71 while (*pe == '\r' || *pe == '\n') pe++;
// Lower-case the line so the "-*-" / "mode" matching is case-insensitive.
75 char *eol = line + strlen(line);
76 for (p = line; p < eol; p++) *p = tolower(*p);
77 p = strstr(line, "-*-");
80 while (*p == ' ' || *p == '\t') p++;
81 if (strncmp(p, "mode", 4) == 0) {
83 while (*p == ' ' || *p == '\t' || *p == ':') p++;
86 while (isalnum(*pe)) pe++;
88 strncpy(buf, p, length);
90 struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
91 if (rl) return rl->name;
94 // Attempt to detect based on Unix 'file' command.
// Prefer diskpath over filepath when it is set.
95 char *path = sourcefile->filepath;
96 if (sourcefile->diskpath)
97 path = sourcefile->diskpath;
// NOTE(review): the formatted command needs strlen(path) + 11 bytes (9 for
// "file -b '", 1 for the closing quote, 1 for the NUL), so this buffer is one
// byte short. The path is also placed inside single quotes without escaping,
// so a path containing a quote character changes the shell command.
98 char command[strlen(path) + 10];
99 sprintf(command, "file -b '%s'", path);
100 FILE *f = popen(command, "r");
// NOTE(review): the fgets() result is ignored; on failure `line` keeps its
// previous contents and is still parsed below.
102 fgets(line, sizeof(line), f);
103 char *eol = line + strlen(line);
104 for (p = line; p < eol; p++) *p = tolower(*p);
105 p = strstr(line, "script text");
106 if (p && p == line) { // /^script text(?: executable)? for \w/
107 p = strstr(line, "for ");
111 while (isalnum(*pe)) pe++;
113 strncpy(buf, p, length);
115 struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
116 if (rl) language = rl->name;
118 } else if (p) { // /(\w+)(?: -\w+)* script text/
// Walk backwards from "script text" to the preceding word.
122 while (*p == ' ') p--;
123 while (p != line && isalnum(*(p - 1))) p--;
// NOTE(review): the loop above can stop with p == line, in which case the
// next test reads one byte before the start of `line`.
124 if (*(p - 1) == '-') p--;
125 } while (*p == '-'); // Skip over any switches.
127 strncpy(buf, p, length);
129 struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
130 if (rl) language = rl->name;
131 } else if (strstr(line, "xml")) language = LANG_XML;
133 if (language) return language;
// Disambiguates ASPX files by looking in the contents for a "<%@ ... %>"
// directive, lower-casing a copy of it, and checking for a "page" directive
// that contains language="vb".
// NOTE(review): the return statements are not shown in this listing (the
// embedded line numbers have gaps).
139 const char *disambiguate_aspx(SourceFile *sourcefile) {
143 p = ohcount_sourcefile_get_contents(sourcefile);
144 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
146 // /<%@\s*Page[^>]+Language="VB"[^>]+%>/
// NOTE(review): the two searches are on consecutive original lines, so when
// "<%@" is absent the second strstr() is called with a NULL haystack.
147 p = strstr(p, "<%@");
148 pe = strstr(p, "%>");
153 strncpy(buf, p, length);
// Lower-case the copied directive so the keyword checks ignore case.
155 char *eol = buf + strlen(buf);
156 for (p = buf; p < eol; p++) *p = tolower(*p);
158 while (*p == ' ' || *p == '\t') p++;
159 if (strncmp(p, "page", 4) == 0) {
161 if (strstr(p, "language=\"vb\""))
// Disambiguates ".b" files: scans the contents for the keyword patterns
// listed in the regex comment below, and otherwise defers to
// disambiguate_basic().
// NOTE(review): the value returned on a pattern match and the loop that
// advances `p` are not shown in this listing.
170 const char *disambiguate_b(SourceFile *sourcefile) {
171 char *p = ohcount_sourcefile_get_contents(sourcefile);
172 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
174 // /(implement[ \t])|(include[ \t]+"[^"]*";)|
175 // ((return|break|continue).*;|(pick|case).*\{)/
176 if (strncmp(p, "implement", 9) == 0 &&
177 (*(p + 9) == ' ' || *(p + 9) == '\t'))
179 else if (strncmp(p, "include", 7) == 0 &&
180 (*(p + 7) == ' ' || *(p + 7) == '\t')) {
182 while (*p == ' ' || *p == '\t') p++;
// NOTE(review): *p is read before the p < eof bound is tested; swapping the
// operands would avoid reading at eof.
184 while (*p != '"' && p < eof) p++;
185 if (*p == '"' && *(p + 1) == ';')
188 } else if (strncmp(p, "return", 6) == 0 ||
189 strncmp(p, "break", 5) == 0 ||
190 strncmp(p, "continue", 8) == 0) {
193 } else if (strncmp(p, "pick", 4) == 0 ||
194 strncmp(p, "case", 4) == 0) {
// No pattern matched: fall back to the BASIC dialect detector.
200 return disambiguate_basic(sourcefile);
// Chooses between BASIC dialects. The contents are examined a line at a time
// for leading digits followed by whitespace, which can yield
// LANG_CLASSIC_BASIC. Next, the other filenames in the file's context are
// checked for the extensions frm, frx, vba, vbp and vbs, which yield
// LANG_VISUALBASIC. Otherwise LANG_STRUCTURED_BASIC is returned.
// NOTE(review): several lines (declarations, loop headers, part of the
// conditions) are not shown in this listing.
203 const char *disambiguate_basic(SourceFile *sourcefile) {
207 // Attempt to detect based on file contents.
209 p = ohcount_sourcefile_get_contents(sourcefile);
211 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
213 // Get a line at a time.
214 while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
// NOTE(review): length may reach sizeof(line), leaving no room for a NUL
// inside `line`; confirm how the copy is terminated.
215 length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
216 strncpy(line, p, length);
// Skip a leading run of digits, then require whitespace after it.
224 while (isdigit(*p)) p++;
225 if (*p == ' ' || *p == '\t') {
227 while (*p == ' ' || *p == '\t') p++;
229 return LANG_CLASSIC_BASIC;
// Step over the line terminator(s) to the start of the next line.
235 while (*pe == '\r' || *pe == '\n') pe++;
239 // Attempt to detect from associated VB files in file context.
240 char **filenames = ohcount_sourcefile_get_filenames(sourcefile);
243 for (i = 0; filenames[i] != NULL; i++) {
244 pe = filenames[i] + strlen(filenames[i]);
// Walk back from the end of the name to just after the last '.'.
246 while (p > filenames[i] && *(p - 1) != '.') p--;
249 (strncmp(p, "frm", length) == 0 ||
250 strncmp(p, "frx", length) == 0 ||
251 strncmp(p, "vba", length) == 0 ||
252 strncmp(p, "vbp", length) == 0 ||
253 strncmp(p, "vbs", length) == 0)) {
254 return LANG_VISUALBASIC;
259 return LANG_STRUCTURED_BASIC;
// Disambiguates ".cs" files: contents containing "<?cs" are reported as
// LANG_CLEARSILVER_TEMPLATE.
// NOTE(review): the fallback return is not shown in this listing. strstr()
// requires the contents to be NUL-terminated; confirm that
// ohcount_sourcefile_get_contents() guarantees this.
262 const char *disambiguate_cs(SourceFile *sourcefile) {
263 // Attempt to detect based on file contents.
264 if (strstr(ohcount_sourcefile_get_contents(sourcefile), "<?cs"))
265 return LANG_CLEARSILVER_TEMPLATE;
// Chooses between fixed-form and free-form Fortran by inspecting the
// contents: the five characters after a leading space, label-like digit runs,
// and '&' continuation markers. Returns LANG_FORTRANFIXED or
// LANG_FORTRANFREE, defaulting to LANG_FORTRANFREE.
// NOTE(review): loop headers and some conditions are not shown in this
// listing (the embedded line numbers have gaps).
270 const char *disambiguate_fortran(SourceFile *sourcefile) {
273 p = ohcount_sourcefile_get_contents(sourcefile);
274 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
// NOTE(review): *p is read before the bound p + 5 < eof is tested.
276 if (*p == ' ' && p + 5 < eof) {
// The five characters after the leading space must be digits or spaces.
278 for (i = 1; i <= 5; i++)
279 if (!isdigit(*(p + i)) && *(p + i) != ' ')
280 return LANG_FORTRANFIXED; // definitely not f77
281 // Possibly fixed (doesn't match /^\s*\d+\s*$/).
283 while (*pe == ' ' || *pe == '\t') pe++;
286 return LANG_FORTRANFIXED;
287 while (isdigit(*pe)) pe++;
288 while (*pe == ' ' || *pe == '\t') pe++;
289 if (*pe != '\r' && *pe != '\n' && pe - p == 5)
290 return LANG_FORTRANFIXED;
// Advance to the end of the line or to an '&', whichever comes first.
// NOTE(review): *p is read before the p < eof bound is tested.
293 while (*p != '\r' && *p != '\n' && *p != '&' && p < eof) p++;
296 // Look for free-form continuation.
297 while (*p == ' ' || *p == '\t') p++;
298 if (*p == '\r' || *p == '\n') {
300 while (*pe == '\r' || *pe == '\n' || *pe == ' ' || *pe == '\t') pe++;
302 return LANG_FORTRANFREE;
305 while (*p == '\r' || *p == '\n') p++;
307 return LANG_FORTRANFREE; // might as well be free-form
// Disambiguates header files. A ".h" file whose name, with the last character
// replaced by 'm', matches another file in its context is reported as
// LANG_OBJECTIVE_C. Otherwise the contents are scanned a line at a time for
// #include lines naming C++ headers or headers with a C++ extension, and for
// the keywords class, namespace, template and typename.
// NOTE(review): the return statements for the C++ and fallback cases, and
// several loop headers, are not shown in this listing.
310 const char *disambiguate_h(SourceFile *sourcefile) {
314 // If the directory contains a matching *.m file, likely Objective C.
315 length = strlen(sourcefile->filename);
316 if (strcmp(sourcefile->ext, "h") == 0) {
// NOTE(review): strncpy() with exactly `length` bytes does not terminate
// `path`; confirm that a terminator is written on a line not shown here.
318 strncpy(path, sourcefile->filename, length);
// Replace the final character of the name with 'm'.
320 *(path + length - 1) = 'm';
321 char **filenames = ohcount_sourcefile_get_filenames(sourcefile);
324 for (i = 0; filenames[i] != NULL; i++)
325 if (strcmp(path, filenames[i]) == 0)
326 return LANG_OBJECTIVE_C;
330 // Attempt to detect based on file contents.
331 char line[81], buf[81];
332 p = ohcount_sourcefile_get_contents(sourcefile);
334 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
336 // Get a line at a time.
337 while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
// NOTE(review): length may reach sizeof(line), leaving no room for a NUL
// inside `line`; confirm how the copy is terminated.
338 length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
339 strncpy(line, p, length);
341 char *eol = line + strlen(line);
344 // Look for C++ headers.
347 while (*p == ' ' || *p == '\t') p++;
348 if (strncmp(p, "include", 7) == 0 &&
349 (*(p + 7) == ' ' || *(p + 7) == '\t')) {
350 // /^#\s*include\s+[<"][^>"]+[>"]/
352 while (*p == ' ' || *p == '\t') p++;
353 if (*p == '<' || *p == '"') {
354 // Is the header file a C++ header file?
357 while (pe < eol && *pe != '>' && *pe != '"') pe++;
359 strncpy(buf, p, length);
361 if (ohcount_hash_is_cppheader(buf, length))
363 // Is the extension for the header file a C++ file?
// Walk back to just after the last '.' to isolate the extension.
365 while (p > line && *(p - 1) != '.') p--;
367 strncpy(buf, p, length);
369 struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
370 if (re && strcmp(re->value, LANG_CPP) == 0)
376 // Look for C++ keywords.
// A keyword candidate starts at a lowercase letter that is not preceded by an
// identifier character.
// NOTE(review): *(p - 1) reads before `line` if p can equal line here; the
// enclosing loop is not shown, so confirm its starting point.
379 if (islower(*p) && !isalnum(*(p - 1)) && *(p - 1) != '_') {
381 while (islower(*pe)) pe++;
382 if (!isalnum(*pe) && *pe != '_') {
384 strncpy(buf, p, length);
386 if (strcmp(buf, "class") == 0 ||
387 strcmp(buf, "namespace") == 0 ||
388 strcmp(buf, "template") == 0 ||
389 strcmp(buf, "typename") == 0)
// Step over the line terminator(s) to the start of the next line.
398 while (*pe == '\r' || *pe == '\n') pe++;
402 // Nothing to suggest C++.
// Disambiguates "*.in" files. When the path has another '.' before the
// trailing ".in", a temporary SourceFile is built for the undecorated path,
// given this file's contents, disk path and filename list, and its detected
// language is used.
// NOTE(review): the final return and the computation of `length` are not
// shown in this listing.
406 const char *disambiguate_in(SourceFile *sourcefile) {
409 const char *language = NULL;
411 p = sourcefile->filepath;
// pe points at the start of the trailing three characters of the path.
412 pe = p + strlen(p) - 3;
// NOTE(review): strstr() finds the first '.' anywhere in filepath, so a dot
// in a directory component also satisfies this test.
413 if (strstr(p, ".") <= pe) {
414 // Only if the filename has an extension prior to the .in
417 strncpy(buf, p, length);
419 SourceFile *undecorated = ohcount_sourcefile_new(buf);
420 p = ohcount_sourcefile_get_contents(sourcefile);
421 // The filepath without the '.in' extension does not exist on disk. The
422 // sourcefile->diskpath field must be set in case the detector needs to run
423 // 'file -b' on the file.
424 ohcount_sourcefile_set_diskpath(undecorated, sourcefile->filepath);
425 ohcount_sourcefile_set_contents(undecorated, p);
426 char **filenames = ohcount_sourcefile_get_filenames(sourcefile);
427 ohcount_sourcefile_set_filenames(undecorated, filenames);
428 language = ohcount_sourcefile_get_language(undecorated);
// The temporary SourceFile is released once its language has been read.
429 ohcount_sourcefile_free(undecorated);
// Disambiguates ".inc" files by inspecting the contents; the visible check
// looks for '?' followed by "php".
// NOTE(review): most of this function, including its return statements, is
// not shown in this listing.
434 const char *disambiguate_inc(SourceFile *sourcefile) {
435 char *p = ohcount_sourcefile_get_contents(sourcefile);
// NOTE(review): eof is derived with strlen() here, whereas the other
// detectors in this file use ohcount_sourcefile_get_contents_size().
436 char *eof = p + strlen(p);
440 else if (*p == '?' && strncmp(p + 1, "php", 3) == 0)
// Disambiguates ".m" files using scores for Matlab/Octave, Objective C and
// Limbo. The other filenames in the file's context are inspected first (".h"
// headers without C/C++ sources add 5 to the Objective C score). The contents
// are then scanned a line at a time for comment styles, method signatures,
// "function"/"include" lines, Octave keywords and Limbo declarations.
// LANG_OBJECTIVE_C is returned when its score exceeds the Matlab score;
// otherwise LANG_OCTAVE or LANG_MATLAB, depending on whether Octave syntax
// was seen.
// NOTE(review): many lines (declarations such as limbo_score, score updates,
// loop headers, the Limbo return) are not shown in this listing.
447 const char *disambiguate_m(SourceFile *sourcefile) {
451 // Attempt to detect based on a weighted heuristic of file contents.
452 int matlab_score = 0;
453 int objective_c_score = 0;
455 int octave_syntax_detected = 0;
457 int i, has_h_headers = 0, has_c_files = 0;
458 char **filenames = ohcount_sourcefile_get_filenames(sourcefile);
460 for (i = 0; filenames[i] != NULL; i++) {
// Suffix test for ".cpp", ".c++" and ".cxx".
464 if (*(pe - 4) == '.' && *(pe - 3) == 'c' &&
465 ((*(pe - 2) == 'p' && *(pe - 1) == 'p') ||
466 (*(pe - 2) == '+' && *(pe - 1) == '+') ||
467 (*(pe - 2) == 'x' && *(pe - 1) == 'x'))) {
469 break; // short circuit
// NOTE(review): if the branch above is guarded by a length test such as
// pe - p >= 4 (not shown), this else-if chain checks ".cc", ".h" and ".c"
// only for names too short to reach the earlier branch. Confirm.
471 } else if (pe - p >= 3) {
472 if (*(pe - 3) == '.' && *(pe - 2) == 'c' && *(pe - 1) == 'c') {
474 break; // short circuit
476 } else if (pe - p >= 2) {
477 if (*(pe - 2) == '.') {
478 if (*(pe - 1) == 'h')
480 else if (*(pe - 1) == 'c' || *(pe - 1) == 'C') {
482 break; // short circuit
488 if (has_h_headers && !has_c_files)
489 objective_c_score += 5;
491 char line[81], buf[81];
492 p = ohcount_sourcefile_get_contents(sourcefile);
494 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
496 // Get a line at a time.
497 while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
// NOTE(review): length may reach sizeof(line), leaving no room for a NUL
// inside `line`; confirm how the copy is terminated.
498 length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
499 strncpy(line, p, length);
501 char *eol = line + strlen(line);
504 // Look for tell-tale lines.
506 while (*p == ' ' || *p == '\t') p++;
507 if (*p == '%') { // Matlab comment
509 } else if (*p == '#') { // Limbo or Octave comment
510 while (*p == '#') p++;
511 if (*p == ' ' || *p == '\t') {
514 octave_syntax_detected = 1;
// NOTE(review): && binds tighter than ||, so the next condition parses as
// (*p == '/' && *(p + 1) == '/') || *(p + 1) == '*'. Any line whose second
// non-blank character is '*' therefore counts as an Objective C comment.
516 } else if (*p == '/' && *(p + 1) == '/' || *(p + 1) == '*') {
517 objective_c_score++; // Objective C comment
518 } else if (*p == '+' || *p == '-') { // Objective C method signature
520 } else if (*p == '@') { // Objective C method signature
521 if (strncmp(p, "@implementation", 15) == 0 ||
522 strncmp(p, "@interface", 10) == 0)
524 } else if (strncmp(p, "function", 8) == 0) { // Matlab or Octave function
526 while (*p == ' ' || *p == '\t') p++;
529 } else if (strncmp(p, "include", 7) == 0) { // Limbo include
530 // /^include[ \t]+"[^"]+\.m";/
532 if (*p == ' ' || *p == '\t') {
533 while (*p == ' ' || *p == '\t') p++;
// NOTE(review): *p is read before the p < eol bound is tested.
535 while (*p != '"' && p < eol) p++;
536 if (*p == '"' && *(p - 2) == '.' && *(p - 1) == 'm')
542 // Look for Octave keywords.
// NOTE(review): *(p - 1) reads before `line` if p can equal line here; the
// enclosing loop is not shown, so confirm its starting point.
545 if (islower(*p) && !isalnum(*(p - 1))) {
547 while (islower(*pe) || *pe == '_') pe++;
550 strncpy(buf, p, length);
552 if (strcmp(buf, "end_try_catch") == 0 ||
553 strcmp(buf, "end_unwind_protect") == 0 ||
554 strcmp(buf, "endfunction") == 0 ||
555 strcmp(buf, "endwhile") == 0)
556 octave_syntax_detected = 1;
562 // Look for Limbo declarations
565 if (*p == ':' && (*(p + 1) == ' ' || *(p + 1) == '\t')) {
566 // /:[ \t]+(module|adt|fn ?\(|con[ \t])/
568 if (strncmp(p, "module", 6) == 0 && !isalnum(*(p + 6)) ||
569 strncmp(p, "adt", 3) == 0 && !isalnum(*(p + 3)) ||
570 strncmp(p, "fn", 2) == 0 &&
571 (*(p + 2) == ' ' && *(p + 3) == '(' || *(p + 2) == '(') ||
572 strncmp(p, "con", 3) == 0 &&
573 (*(p + 3) == ' ' || *(p + 3) == '\t'))
// Step over the line terminator(s) to the start of the next line.
580 while (*pe == '\r' || *pe == '\n') pe++;
// Pick the highest-scoring language; Limbo must beat both other scores.
584 if (limbo_score > objective_c_score && limbo_score > matlab_score)
586 else if (objective_c_score > matlab_score)
587 return LANG_OBJECTIVE_C;
589 return octave_syntax_detected ? LANG_OCTAVE : LANG_MATLAB;
// Disambiguates ".st" files: scans the contents a line at a time and returns
// LANG_SMALLTALK once an assignment, a block start and a block end ("]"
// followed by ".") have all been seen.
// NOTE(review): the assignment and block-start tests and the fallback return
// are not shown in this listing.
592 const char *disambiguate_st(SourceFile *sourcefile) {
596 // Attempt to detect based on file contents.
597 int found_assignment = 0, found_block_start = 0, found_block_end = 0;
600 p = ohcount_sourcefile_get_contents(sourcefile);
602 char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
604 // Get a line at a time.
// NOTE(review): this loop advances pe but bounds p, which does not change
// inside it, so the eof check cannot stop the scan. The other detectors in
// this file test pe < eof here.
605 while (p < eof && *pe != '\r' && *pe != '\n') pe++;
// NOTE(review): length may reach sizeof(line), leaving no room for a NUL
// inside `line`; confirm how the copy is terminated.
606 length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
607 strncpy(line, p, length);
609 char *eol = line + strlen(line);
612 for (p = line; p < eol; p++) {
615 while (p < eol && (*p == ' ' || *p == '\t')) p++;
617 found_assignment = 1;
619 found_block_start = 1;
620 } else if (*p == ']' && *(p + 1) == '.') found_block_end = 1;
// All three markers seen: report Smalltalk.
621 if (found_assignment && found_block_start && found_block_end)
622 return LANG_SMALLTALK;
// Step over the line terminator(s) to the start of the next line.
627 while (*pe == '\r' || *pe == '\n') pe++;
// Reports whether a filename's extension maps to a value tagged as binary.
// The text after the last '.' is looked up as given and then lower-cased; on
// a hit the result of ISBINARY() on the mapped value is returned.
// NOTE(review): the handling of names without an extension and the final
// return are not shown in this listing.
634 int ohcount_is_binary_filename(const char *filename) {
// Start at the end of the name and walk back to just after the last '.'.
635 char *p = (char *)filename + strlen(filename);
636 while (p > filename && *(p - 1) != '.') p--;
638 struct ExtensionMap *re;
639 int length = strlen(p);
640 re = ohcount_hash_language_from_ext(p, length);
641 if (re) return ISBINARY(re->value);
642 // Try the lower-case version of this extension.
// NOTE(review): lowerext has `length` elements, but the terminator below is
// stored at index `length`, one past the end; it needs length + 1 elements.
643 char lowerext[length];
644 strncpy(lowerext, p, length);
645 lowerext[length] = '\0';
// NOTE(review): tolower() on a plain char is undefined for negative values;
// the argument should be cast to unsigned char.
646 for (p = lowerext; p < lowerext + length; p++) *p = tolower(*p);
647 re = ohcount_hash_language_from_ext(lowerext, length);
648 if (re) return ISBINARY(re->value);