git.oblomov.eu Git - ohcount/blob - src/detector.c

   1 // detector.c written by Mitchell Foral. mitchell<att>caladbolg.net.
   2 // See COPYING for license information.
   3
   4 #include <ctype.h>
   5 #include <magic.h>
   6 #include <stdio.h>
   7 #include <stdlib.h>
   8 #include <string.h>
   9 #include <unistd.h>
  10
  11 #include "detector.h"
  12 #include "languages.h"
  13 #include "log.h"
  14
  15 #include "hash/cppheader_hash.h"
  16 #include "hash/disambiguatefunc_hash.h"
  17 #include "hash/extension_hash.h"
  18 #include "hash/filename_hash.h"
  19
  20 #define ISBINARY(x) (x[0] == '\1')
  21 #define ISAMBIGUOUS(x) (x[0] == '\2')
  22 #define DISAMBIGUATEWHAT(x) &x[1]
  23
#ifdef _WIN32
// Windows builds: provide mkstemp() as a macro that opens the name produced
// by _mktemp() with create/exclusive/short-lived flags.
// NOTE(review): _open() is given no permission-mode argument although
// _O_CREAT is set -- confirm against the CRT documentation.
# include <fcntl.h>
# define mkstemp(p) _open(_mktemp(p), _O_CREAT | _O_SHORT_LIVED | _O_EXCL)
#endif
  28
  29 /* Parse the output of libmagic and return a language, if any.
  30  * The contents of string `line` will be destroyed.
  31  */
  32 const char *magic_parse(char *line) {
  33   char *p, *pe;
  34   char *eol = line + strlen(line);
  35
  36   char buf[80];
  37   size_t length;
  38
  39   for (p = line; p < eol; p++) *p = tolower(*p);
  40   p = strstr(line, "script text"); // Example: "script text executable for perl -w,"
  41   if (p && p == line) {
  42     p = strstr(line, "for ");
  43     if (p) {
  44       p += 4;
  45       pe = p;
  46       while (isalnum(*pe)) pe++;
  47       length = pe - p;
  48       strncpy(buf, p, length);
  49       buf[length] = '\0';
  50       struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
  51       if (rl) return(rl->name);
  52     }
  53   }
  54
  55   p = strstr(line, "script"); // Example: "PHP script, ASCII text"
  56   if (p) {
  57     do {
  58       p--;
  59       pe = p;
  60       while (*p == ' ') p--;
  61       while (p != line && isalnum(*(p - 1))) p--;
  62       if (p != line && *(p - 1) == '-') p--;
  63     } while (*p == '-'); // Skip over any switches.
  64     length = pe - p;
  65     strncpy(buf, p, length);
  66     buf[length] = '\0';
  67     struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
  68     if (rl) return(rl->name);
  69   } else if (strstr(line, "xml")) return(LANG_XML);
  70
  71   return NULL;
  72 }
  73
  74 /* Use libmagic to detect file language
  75  */
  76 const char *detect_language_magic(SourceFile *sourcefile) {
  77   char line[80];
  78
  79   magic_t cookie = magic_open(MAGIC_NONE);
  80   if (cookie == NULL) {
  81     fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
  82     exit(1);
  83   }
  84   if (magic_load(cookie, NULL) != 0) {
  85     fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
  86     magic_close(cookie);
  87     exit(1);
  88   }
  89
  90   if (sourcefile->diskpath) {
  91     const char *magic = magic_file(cookie, sourcefile->diskpath);
  92     if (magic == NULL) {
  93       fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
  94       magic_close(cookie);
  95       exit(1);
  96     }
  97     strncpy(line, magic, sizeof(line));
  98     line[sizeof(line)-1] = '\0';
  99   } else {
 100     char *p = ohcount_sourcefile_get_contents(sourcefile);
 101     if (!p) return NULL;
 102
 103     const char *magic = magic_buffer(cookie, p, strlen(p));
 104     if (magic == NULL) {
 105       fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
 106       magic_close(cookie);
 107       exit(1);
 108     }
 109     strncpy(line, magic, sizeof(line));
 110     line[sizeof(line)-1] = '\0';
 111   }
 112
 113   magic_close(cookie);
 114
 115   return magic_parse(line);
 116 }
 117
 118 /* Use all available means to detect file language
 119  */
 120 const char *ohcount_detect_language(SourceFile *sourcefile) {
 121   const char *language = NULL;
 122   char *p, *pe;
 123   int length;
 124
 125   // Attempt to detect based on file extension.
 126   length = strlen(sourcefile->ext);
 127   struct ExtensionMap *re = ohcount_hash_language_from_ext(sourcefile->ext,
 128                                                                length);
 129   if (re) language = re->value;
 130   if (!language) {
 131     // Try the lower-case version of this extension.
 132     char lowerext[length + 1];
 133     strncpy(lowerext, sourcefile->ext, length);
 134     lowerext[length] = '\0';
 135     for (p = lowerext; p < lowerext + length; p++) *p = tolower(*p);
 136     struct ExtensionMap *re = ohcount_hash_language_from_ext(lowerext, length);
 137     if (re) language = re->value;
 138   }
 139
 140   // Attempt to detect using Emacs mode line (/^-\*-\s*mode[\s:]*\w/i).
 141   if(!language) {
 142     char line[81] = { '\0' }, buf[81];
 143     p = ohcount_sourcefile_get_contents(sourcefile);
 144     pe = p;
 145     char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 146     while (pe < eof) {
 147       // Get the contents of the first line.
 148       while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 149       length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 150       strncpy(line, p, length);
 151       line[length] = '\0';
 152       if (*line == '#' && *(line + 1) == '!') {
 153         // First line was sh-bang; loop to get contents of second line.
 154         while (*pe == '\r' || *pe == '\n') pe++;
 155         p = pe;
 156       } else break;
 157     }
 158     p = strstr(line, "-*-");
 159     if (p) {
 160       p += 3;
 161       while (*p == ' ' || *p == '\t') p++;
 162       // detect "mode" (any capitalization)
 163       if (strncasecmp(p, "mode", 4) == 0) {
 164         p += 4;
 165         while (*p == ' ' || *p == '\t' || *p == ':') p++;
 166       }
 167       pe = p;
 168       while (!isspace(*pe) && *pe != ';' && pe != strstr(pe, "-*-")) pe++;
 169       length = (pe - p <= sizeof(buf)) ? pe - p : sizeof(buf);
 170       strncpy(buf, p, length);
 171       buf[length] = '\0';
 172
 173                   // Special case for "c" or "C" emacs mode header: always means C, not C++
 174                   if (strcasecmp(buf, "c") == 0) {
 175                                 return LANG_C;
 176                   }
 177
 178       // First try it with the language name.
 179       struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
 180       if (rl) language = rl->name;
 181       if(!language) {
 182         // Then try it with the extension table.
 183         struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
 184         if (re) language = re->value;
 185       }
 186       if (!language) {
 187         // Try the lower-case version of this modeline.
 188         for (pe = buf; pe < buf+length; pe++) *pe = tolower(*pe);
 189         // First try it with the language name.
 190         rl = ohcount_hash_language_from_name(buf, length);
 191         if (rl) language = rl->name;
 192       }
 193       if (!language) {
 194         // Then try it with the extension table.
 195         struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
 196         if (re) language = re->value;
 197       }
 198     }
 199   }
 200
 201   // Attempt to detect based on filename.
 202   if(!language) {
 203     length = strlen(sourcefile->filename);
 204     struct FilenameMap *rf =
 205       ohcount_hash_language_from_filename(sourcefile->filename, length);
 206     if (rf) language = rf->value;
 207   }
 208
 209   // Attempt to detect based on Unix 'file' command.
 210   if(!language) {
 211     language = detect_language_magic(sourcefile);
 212   }
 213
 214   if (language) {
 215     if (ISAMBIGUOUS(language)) {
 216       // Call the appropriate function for disambiguation.
 217       length = strlen(DISAMBIGUATEWHAT(language));
 218       struct DisambiguateFuncsMap *rd =
 219         ohcount_hash_disambiguate_func_from_id(DISAMBIGUATEWHAT(language),
 220                                                length);
 221       if (rd) language = rd->value(sourcefile);
 222     } else language = ISBINARY(language) ? NULL : language;
 223   }
 224   return language;
 225 }
 226
 227 const char *disambiguate_aspx(SourceFile *sourcefile) {
 228   char *p = ohcount_sourcefile_get_contents(sourcefile);
 229   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 230   for (; p < eof; p++) {
 231     // /<%@\s*Page[^>]+Language="VB"[^>]+%>/
 232     p = strstr(p, "<%@");
 233     if (!p)
 234                         break;
 235     char *pe = strstr(p, "%>");
 236     if (p && pe) {
 237       p += 3;
 238       const int length = pe - p;
 239       char buf[length];
 240       strncpy(buf, p, length);
 241       buf[length] = '\0';
 242       char *eol = buf + strlen(buf);
 243       for (p = buf; p < eol; p++) *p = tolower(*p);
 244       p = buf;
 245       while (*p == ' ' || *p == '\t') p++;
 246       if (strncmp(p, "page", 4) == 0) {
 247         p += 4;
 248         if (strstr(p, "language=\"vb\""))
 249           return LANG_VB_ASPX;
 250       }
 251     }
 252   }
 253   return LANG_CS_ASPX;
 254 }
 255
 256 // 6502 assembly or XML-based Advanced Stream Redirector ?
 257 const char *disambiguate_asx(SourceFile *sourcefile) {
 258   char *p = ohcount_sourcefile_get_contents(sourcefile);
 259   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 260   for (; p < eof; p++) {
 261     switch (*p) {
 262     case ' ':
 263     case '\t':
 264     case '\n':
 265     case '\r':
 266       break;
 267     case '<':
 268     case '\0':
 269     // byte-order marks:
 270     case (char) 0xef:
 271     case (char) 0xfe:
 272     case (char) 0xff:
 273       return NULL; // XML
 274     default:
 275       return LANG_ASSEMBLER;
 276     }
 277   }
 278   return LANG_ASSEMBLER; // only blanks - not valid XML, may be valid asm
 279 }
 280
 281 const char *disambiguate_b(SourceFile *sourcefile) {
 282   char *p = ohcount_sourcefile_get_contents(sourcefile);
 283   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 284   while (p < eof) {
 285     // /(implement[ \t])|(include[ \t]+"[^"]*";)|
 286     //  ((return|break|continue).*;|(pick|case).*\{)/
 287     if (strncmp(p, "implement", 9) == 0 &&
 288         (*(p + 9) == ' ' || *(p + 9) == '\t'))
 289       return LANG_LIMBO;
 290     else if (strncmp(p, "include", 7) == 0 &&
 291         (*(p + 7) == ' ' || *(p + 7) == '\t')) {
 292       p += 7;
 293       while (*p == ' ' || *p == '\t') p++;
 294       if (*p == '"') {
 295         while (*p != '"' && p < eof) p++;
 296         if (*p == '"' && *(p + 1) == ';')
 297           return LANG_LIMBO;
 298       }
 299     } else if (strncmp(p, "return", 6) == 0 ||
 300                strncmp(p, "break", 5) == 0 ||
 301                strncmp(p, "continue", 8) == 0) {
 302       if (strstr(p, ";"))
 303         return LANG_LIMBO;
 304     } else if (strncmp(p, "pick", 4) == 0 ||
 305                strncmp(p, "case", 4) == 0) {
 306       if (strstr(p, "{"))
 307         return LANG_LIMBO;
 308     }
 309     p++;
 310   }
 311   return disambiguate_basic(sourcefile);
 312 }
 313
 314 const char *disambiguate_basic(SourceFile *sourcefile) {
 315   char *p, *pe;
 316   int length;
 317
 318   // Attempt to detect based on file contents.
 319   char line[81];
 320   p = ohcount_sourcefile_get_contents(sourcefile);
 321   pe = p;
 322   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 323   while (pe < eof) {
 324     // Get a line at a time.
 325     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 326     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 327     strncpy(line, p, length);
 328     line[length] = '\0';
 329     char *line_end = pe;
 330
 331     p = line;
 332     if (isdigit(*p)) {
 333       // /^\d+\s+\w/
 334       p++;
 335       while (isdigit(*p)) p++;
 336       if (*p == ' ' || *p == '\t') {
 337         p++;
 338         while (*p == ' ' || *p == '\t') p++;
 339         if (isalnum(*p))
 340           return LANG_CLASSIC_BASIC;
 341       }
 342     }
 343
 344     // Next line.
 345     pe = line_end;
 346     while (*pe == '\r' || *pe == '\n') pe++;
 347     p = pe;
 348   }
 349
 350   // Attempt to detect from associated VB files in file context.
 351   char **filenames = sourcefile->filenames;
 352   if (filenames) {
 353     int i;
 354     for (i = 0; filenames[i] != NULL; i++) {
 355       pe = filenames[i] + strlen(filenames[i]);
 356       p = pe;
 357       while (p > filenames[i] && *(p - 1) != '.') p--;
 358       length = pe - p;
 359       if (length == 3 &&
 360           (strncmp(p, "frm", length) == 0 ||
 361            strncmp(p, "frx", length) == 0 ||
 362            strncmp(p, "vba", length) == 0 ||
 363            strncmp(p, "vbp", length) == 0 ||
 364            strncmp(p, "vbs", length) == 0)) {
 365         return LANG_VISUALBASIC;
 366       }
 367     }
 368   }
 369
 370   return LANG_STRUCTURED_BASIC;
 371 }
 372
 373 const char *disambiguate_cs(SourceFile *sourcefile) {
 374   // Attempt to detect based on file contents.
 375         char *contents = ohcount_sourcefile_get_contents(sourcefile);
 376   if (contents && strstr(contents, "<?cs"))
 377     return LANG_CLEARSILVER_TEMPLATE;
 378   else
 379     return LANG_CSHARP;
 380 }
 381
 382 const char *disambiguate_def(SourceFile *sourcefile) {
 383   char *p = ohcount_sourcefile_get_contents(sourcefile);
 384   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 385   for (; p < eof; p++) {
 386     switch (*p) {
 387     case ' ':
 388     case '\t':
 389     case '\n':
 390     case '\r':
 391       break;
 392     case '(':
 393       if (p[1] == '*') // Modula-2 comment
 394         return LANG_MODULA2;
 395       return NULL;
 396     case 'D':
 397       if (strncmp(p, "DEFINITION", 10) == 0) // Modula-2 "DEFINITION MODULE"
 398         return LANG_MODULA2;
 399       return NULL;
 400     default:
 401       return NULL; // not Modula-2
 402     }
 403   }
 404   return NULL; // only blanks
 405 }
 406
/* Decide between fixed-form and free-form Fortran.
 *
 * Every line of the contents is checked against the fixed-form layout
 * rules. The first line that violates them returns LANG_FORTRANFREE; if no
 * line does, LANG_FORTRANFIXED is returned.
 */
const char *disambiguate_fortran(SourceFile *sourcefile) {
  char *p;

  p = ohcount_sourcefile_get_contents(sourcefile);
  char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);

  // Try the assumption of a fixed formatted source code, and return free
  // format if anything opposes this assumption.
  // Rules based on the Fortran standard, page 47:
  // ftp://ftp.nag.co.uk/sc22wg5/N1801-N1850/N1830.pdf
  while (p < eof) {
    // i counts the column of *p within the current line, starting at 1.
    int i = 1;
    // blanklabel stays true while columns 1-5 (the label field) are blank.
    int blanklabel;
    // Process a single line; tabulators are not valid in Fortran code
    // but some compilers accept them to skip the first 5 columns.
    if (*p == ' ' || *p == '\t' || isdigit(*p)) {
      // Only consider lines starting with a blank or digit
      // (non-comment in fixed)
      if (*p == '\t') i = 5;
      blanklabel = (*p == ' ' || *p == '\t');
      while (*p != '\r' && *p != '\n' && p < eof) {
        p++; i++;
        if (i <= 5) {
          blanklabel = blanklabel && (*p == ' ');
          if ( !isdigit(*p) && *p != ' ' && *p != '!')
            // Non-digit, non-blank, non-comment character in the label field:
            // definitely not valid fixed formatted code!
            return LANG_FORTRANFREE;
        }
        if ((i == 6) && !blanklabel && *p != ' ' && *p != '0')
          // Fixed format continuation line with non-blank label field
          // not allowed, assume free format:
          return LANG_FORTRANFREE;
        // Ignore comments (a ! character in column 6 is a continuation in
        // fixed form)
        if (*p == '!' && i != 6) {
          while (*p != '\r' && *p != '\n' && p < eof) p++;
        } else {
          // Ignore quotes: advance to the closing quote character (or to the
          // end of the line) so that a quoted '&' is not taken for a
          // continuation marker.
          if (*p == '"') {
            if (p < eof) {p++; i++;}
            while (*p != '"' && *p != '\r' && *p != '\n' && p < eof) {
              p++; i++;
            }
          }
          if (*p == '\'') {
            if (p < eof) {p++; i++;}
            while (*p != '\'' && *p != '\r' && *p != '\n' && p < eof) {
              p++; i++;
            }
          }
          // Check for free format line continuation
          if (i > 6 && i <= 72 && *p == '&')
            // Found an unquoted free format continuation character in the fixed
            // format code section. This has to be free format.
            return LANG_FORTRANFREE;
        }
      }
    } else {
      // Not a statement line in fixed format...
      if (*p != 'C' && *p != 'c' && *p != '*' && *p != '!')
        // Not a valid fixed form comment, has to be free formatted source
        return LANG_FORTRANFREE;
      // Comment in fixed form, ignore this line
      while (*p != '\r' && *p != '\n' && p < eof) p++;
    }
    // Skip all line ends
    while ((*p == '\r' || *p == '\n') && p < eof) p++;
  }
  // Assume fixed format if none of the lines broke the assumptions
  return LANG_FORTRANFIXED;
}
 479
 480 const char *disambiguate_h(SourceFile *sourcefile) {
 481   char *p, *pe, *bof;
 482   int length;
 483
 484   // If the directory contains a matching *.m file, likely Objective C.
 485   length = strlen(sourcefile->filename);
 486   if (strcmp(sourcefile->ext, "h") == 0) {
 487     char path[length];
 488     strncpy(path, sourcefile->filename, length);
 489     path[length] = '\0';
 490     *(path + length - 1) = 'm';
 491     char **filenames = sourcefile->filenames;
 492     if (filenames) {
 493       int i;
 494       for (i = 0; filenames[i] != NULL; i++)
 495         if (strcmp(path, filenames[i]) == 0)
 496           return LANG_OBJECTIVE_C;
 497     }
 498   }
 499
 500   // Attempt to detect based on file contents.
 501   char line[81], buf[81];
 502   bof = ohcount_sourcefile_get_contents(sourcefile);
 503   p = bof;
 504   pe = p;
 505   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 506   while (pe < eof) {
 507     // Get a line at a time.
 508     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 509     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 510     strncpy(line, p, length);
 511     line[length] = '\0';
 512     char *eol = line + strlen(line);
 513     char *line_end = pe;
 514
 515     // Look for C++ headers.
 516     if (*line == '#') {
 517       p = line + 1;
 518       while (*p == ' ' || *p == '\t') p++;
 519       if (strncmp(p, "include", 7) == 0 &&
 520           (*(p + 7) == ' ' || *(p + 7) == '\t')) {
 521         // /^#\s*include\s+[<"][^>"]+[>"]/
 522         p += 8;
 523         while (*p == ' ' || *p == '\t') p++;
 524         if (*p == '<' || *p == '"') {
 525           // Is the header file a C++ header file?
 526           p++;
 527           pe = p;
 528           while (pe < eol && *pe != '>' && *pe != '"') pe++;
 529           length = pe - p;
 530           strncpy(buf, p, length);
 531           buf[length] = '\0';
 532           if (ohcount_hash_is_cppheader(buf, length))
 533             return LANG_CPP;
 534           // Is the extension for the header file a C++ file?
 535           p = pe;
 536           while (p > line && *(p - 1) != '.') p--;
 537           length = pe - p;
 538           strncpy(buf, p, length);
 539           buf[length] = '\0';
 540           struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
 541           if (re && strcmp(re->value, LANG_CPP) == 0)
 542             return LANG_CPP;
 543         }
 544       }
 545     }
 546
 547     // Look for C++ keywords.
 548     p = line;
 549     while (p < eol) {
 550       if (islower(*p) && p != bof && !isalnum(*(p - 1)) && *(p - 1) != '_') {
 551         pe = p;
 552         while (islower(*pe)) pe++;
 553         if (!isalnum(*pe) && *pe != '_') {
 554           length = pe - p;
 555           strncpy(buf, p, length);
 556           buf[length] = '\0';
 557           if (strcmp(buf, "class") == 0 ||
 558               strcmp(buf, "namespace") == 0 ||
 559               strcmp(buf, "template") == 0 ||
 560               strcmp(buf, "typename") == 0)
 561             return LANG_CPP;
 562         }
 563         p = pe + 1;
 564       } else p++;
 565     }
 566
 567     // Next line.
 568     pe = line_end;
 569     while (*pe == '\r' || *pe == '\n') pe++;
 570     p = pe;
 571   }
 572
 573   // Nothing to suggest C++.
 574   return LANG_C;
 575 }
 576
 577 const char *disambiguate_in(SourceFile *sourcefile) {
 578   char *p, *pe;
 579   int length;
 580   const char *language = NULL;
 581
 582   p = sourcefile->filepath;
 583   pe = p + strlen(p) - 3;
 584   if (strstr(p, ".") <= pe) {
 585     // Only if the filename has an extension prior to the .in
 586     length = pe - p;
 587     char buf[length];
 588     strncpy(buf, p, length);
 589     buf[length] = '\0';
 590     p = ohcount_sourcefile_get_contents(sourcefile);
 591                 if (!p) {
 592                         return NULL;
 593                 }
 594
 595     // A SourceFile's filepath and diskpath need not be the same.
 596     // Here, we'll take advantage of this to set up a new SourceFile
 597     // whose filepath does not have the *.in extension, but whose
 598     // diskpath still points back to the original file on disk (if any).
 599     SourceFile *undecorated = ohcount_sourcefile_new(buf);
 600     if (sourcefile->diskpath) {
 601       ohcount_sourcefile_set_diskpath(undecorated, sourcefile->diskpath);
 602     }
 603     ohcount_sourcefile_set_contents(undecorated, p);
 604                 undecorated->filenames = sourcefile->filenames;
 605     language = ohcount_sourcefile_get_language(undecorated);
 606     ohcount_sourcefile_free(undecorated);
 607   }
 608   return language;
 609 }
 610
 611 const char *disambiguate_inc(SourceFile *sourcefile) {
 612   char *p = ohcount_sourcefile_get_contents(sourcefile);
 613         if (p) {
 614                 char *eof = p + strlen(p);
 615                 while (p < eof) {
 616                         if (*p == '\0')
 617                                 return BINARY;
 618                         else if (*p == '?' && strncmp(p + 1, "php", 3) == 0)
 619                                 return LANG_PHP;
 620                         p++;
 621                 }
 622         }
 623   return NULL;
 624 }
 625
 626 const char *disambiguate_m(SourceFile *sourcefile) {
 627   char *p, *pe;
 628   int length;
 629
 630   // Attempt to detect based on a weighted heuristic of file contents.
 631   int matlab_score = 0;
 632   int objective_c_score = 0;
 633   int limbo_score = 0;
 634   int octave_syntax_detected = 0;
 635
 636   int i, has_h_headers = 0, has_c_files = 0;
 637   char **filenames = sourcefile->filenames;
 638   if (filenames) {
 639     for (i = 0; filenames[i] != NULL; i++) {
 640       p = filenames[i];
 641       pe = p + strlen(p);
 642       if (pe - p >= 4) {
 643         if (*(pe - 4) == '.' && *(pe - 3) == 'c' &&
 644             ((*(pe - 2) == 'p' && *(pe - 1) == 'p') ||
 645              (*(pe - 2) == '+' && *(pe - 1) == '+') ||
 646              (*(pe - 2) == 'x' && *(pe - 1) == 'x'))) {
 647           has_c_files = 1;
 648           break; // short circuit
 649         }
 650       } else if (pe - p >= 3) {
 651         if (*(pe - 3) == '.' && *(pe - 2) == 'c' && *(pe - 1) == 'c') {
 652           has_c_files = 1;
 653           break; // short circuit
 654         }
 655       } else if (pe - p >= 2) {
 656         if (*(pe - 2) == '.') {
 657           if (*(pe - 1) == 'h')
 658             has_h_headers = 1;
 659           else if (*(pe - 1) == 'c' || *(pe - 1) == 'C') {
 660             has_c_files = 1;
 661             break; // short circuit
 662           }
 663         }
 664       }
 665     }
 666   }
 667   if (has_h_headers && !has_c_files)
 668     objective_c_score += 5;
 669
 670   char line[81], buf[81];
 671   p = ohcount_sourcefile_get_contents(sourcefile);
 672   pe = p;
 673   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 674   while (pe < eof) {
 675     // Get a line at a time.
 676     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 677     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 678     strncpy(line, p, length);
 679     line[length] = '\0';
 680     char *eol = line + strlen(line);
 681     char *line_end = pe;
 682
 683     // Look for tell-tale lines.
 684     p = line;
 685     while (*p == ' ' || *p == '\t') p++;
 686     if (*p == '%') { // Matlab comment
 687       matlab_score++;
 688                 } else if (*p == '#' && strncmp(p, "#import", 7) == 0) { // Objective C
 689                         objective_c_score++;
 690     } else if (*p == '#') { // Limbo or Octave comment
 691       while (*p == '#') p++;
 692       if (*p == ' ' || *p == '\t') {
 693         limbo_score++;
 694         matlab_score++;
 695         octave_syntax_detected = 1;
 696       }
 697     } else if (*p == '/' && *(p + 1) == '/' || *(p + 1) == '*') {
 698       objective_c_score++; // Objective C comment
 699     } else if (*p == '+' || *p == '-') { // Objective C method signature
 700       objective_c_score++;
 701     } else if (*p == '@' || *p == '#') { // Objective C method signature
 702       if (strncmp(p, "@implementation", 15) == 0 ||
 703           strncmp(p, "@interface", 10) == 0)
 704         objective_c_score++;
 705     } else if (strncmp(p, "function", 8) == 0) { // Matlab or Octave function
 706       p += 8;
 707       while (*p == ' ' || *p == '\t') p++;
 708       if (*p == '(')
 709         matlab_score++;
 710     } else if (strncmp(p, "include", 7) == 0) { // Limbo include
 711       // /^include[ \t]+"[^"]+\.m";/
 712       p += 7;
 713       if (*p == ' ' || *p == '\t') {
 714         while (*p == ' ' || *p == '\t') p++;
 715         if (*p == '"') {
 716           while (*p != '"' && p < eol) p++;
 717           if (*p == '"' && *(p - 2) == '.' && *(p - 1) == 'm')
 718             limbo_score++;
 719         }
 720       }
 721     }
 722
 723     // Look for Octave keywords.
 724     p = line;
 725     while (p < eol) {
 726       if (islower(*p) && p != line && !isalnum(*(p - 1))) {
 727         pe = p;
 728         while (islower(*pe) || *pe == '_') pe++;
 729         if (!isalnum(*pe)) {
 730           length = pe - p;
 731           strncpy(buf, p, length);
 732           buf[length] = '\0';
 733           if (strcmp(buf, "end_try_catch") == 0 ||
 734               strcmp(buf, "end_unwind_protect") == 0 ||
 735               strcmp(buf, "endfunction") == 0 ||
 736               strcmp(buf, "endwhile") == 0)
 737             octave_syntax_detected = 1;
 738         }
 739         p = pe + 1;
 740       } else p++;
 741     }
 742
 743     // Look for Limbo declarations
 744     p = line;
 745     while (p < eol) {
 746       if (*p == ':' && (*(p + 1) == ' ' || *(p + 1) == '\t')) {
 747         // /:[ \t]+(module|adt|fn ?\(|con[ \t])/
 748         p += 2;
 749         if (strncmp(p, "module", 6) == 0 && !isalnum(*(p + 6)) ||
 750             strncmp(p, "adt", 3) == 0 && !isalnum(*(p + 3)) ||
 751             strncmp(p, "fn", 2) == 0 &&
 752               (*(p + 2) == ' ' && *(p + 3) == '(' || *(p + 2) == '(') ||
 753             strncmp(p, "con", 3) == 0 &&
 754               (*(p + 3) == ' ' || *(p + 3) == '\t'))
 755           limbo_score++;
 756       } else p++;
 757     }
 758
 759     // Next line.
 760     pe = line_end;
 761     while (*pe == '\r' || *pe == '\n') pe++;
 762     p = pe;
 763   }
 764
 765   if (limbo_score > objective_c_score && limbo_score > matlab_score)
 766     return LANG_LIMBO;
 767   else if (objective_c_score > matlab_score)
 768     return LANG_OBJECTIVE_C;
 769   else
 770     return octave_syntax_detected ? LANG_OCTAVE : LANG_MATLAB;
 771 }
 772
 773 #include <pcre.h>
 774
 775 // strnlen is not available on OS X, so we roll our own
 776 size_t mystrnlen(const char *begin, size_t maxlen) {
 777   if (begin == NULL)
 778     return 0;
 779   const char *end = memchr(begin, '\0', maxlen);
 780   return end ? (end - begin) : maxlen;
 781 }
 782
 783 const char *disambiguate_pp(SourceFile *sourcefile) {
 784         char *p = ohcount_sourcefile_get_contents(sourcefile);
 785
 786         if (!p)
 787           return NULL;
 788
 789         /* prepare regular expressions */
 790         const char *error;
 791         int erroffset;
 792
 793         /* try harder with optional spaces */
 794         pcre *keyword;
 795         keyword = pcre_compile("^\\s*(ensure|content|notify|require|source)\\s+=>",
 796                         PCRE_MULTILINE, &error, &erroffset, NULL);
 797
 798         if (pcre_exec(keyword, NULL, p, mystrnlen(p, 10000), 0, 0, NULL, 0) > -1)
 799                 return LANG_PUPPET;
 800
 801         /* check for standard puppet constructs */
 802         pcre *construct;
 803         construct = pcre_compile("^\\s*(define\\s+[\\w:-]+\\s*\\(|class\\s+[\\w:-]+(\\s+inherits\\s+[\\w:-]+)?\\s*{|node\\s+\\'?[\\w:\\.-]+\\'?\\s*{|import\\s+\")",
 804                         PCRE_MULTILINE, &error, &erroffset, NULL);
 805
 806         if (pcre_exec(construct, NULL, p, mystrnlen(p, 10000), 0, 0, NULL, 0) > -1)
 807                 return LANG_PUPPET;
 808
 809         return LANG_PASCAL;
 810 }
 811
 812 const char *disambiguate_pl(SourceFile *sourcefile) {
 813         char *contents = ohcount_sourcefile_get_contents(sourcefile);
 814   if (!contents)
 815     return NULL;
 816
 817   // Check for a perl shebang on first line of file
 818         const char *error;
 819         int erroffset;
 820         pcre *re = pcre_compile("#![^\\n]*perl", PCRE_CASELESS, &error, &erroffset, NULL);
 821   if (pcre_exec(re, NULL, contents, mystrnlen(contents, 100), 0, PCRE_ANCHORED, NULL, 0) > -1)
 822     return LANG_PERL;
 823
 824   // Check for prolog :- rules
 825   if (strstr(contents, ":- ") || strstr(contents, ":-\n"))
 826     return LANG_PROLOG;
 827
 828   // Perl by default.
 829   return LANG_PERL;
 830 }
 831
 832 #define QMAKE_SOURCES_SPACE "SOURCES +="
 833 #define QMAKE_SOURCES "SOURCES+="
 834 #define QMAKE_CONFIG_SPACE "CONFIG +="
 835 #define QMAKE_CONFIG "CONFIG+="
 836
 837 const char *disambiguate_pro(SourceFile *sourcefile) {
 838         char *p = ohcount_sourcefile_get_contents(sourcefile);
 839   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 840         for (; p < eof; p++) {
 841                 if (strncmp(p, QMAKE_SOURCES_SPACE, strlen(QMAKE_SOURCES_SPACE)) == 0 ||
 842                                 strncmp(p, QMAKE_SOURCES, strlen(QMAKE_SOURCES)) == 0 ||
 843                                 strncmp(p, QMAKE_CONFIG_SPACE, strlen(QMAKE_CONFIG_SPACE)) == 0 ||
 844                                 strncmp(p, QMAKE_CONFIG, strlen(QMAKE_CONFIG)) == 0)
 845                         return LANG_MAKE; // really QMAKE
 846         }
 847         return LANG_IDL_PVWAVE;
 848 }
 849
 850 const char *disambiguate_r(SourceFile *sourcefile) {
 851   char *contents = ohcount_sourcefile_get_contents(sourcefile);
 852   if (!contents)
 853     return LANG_R;
 854
 855   char *eof = contents + ohcount_sourcefile_get_contents_size(sourcefile);
 856
 857   // Detect REBOL by looking for the occurence of "rebol" in the contents
 858   // (case-insensitive). Correct REBOL scripts have a "REBOL [...]" header
 859   // block.
 860   char *needle = "rebol";
 861   int len = strlen(needle);
 862   for (; contents < eof - len; ++contents)
 863     if (tolower(*contents) == *needle &&
 864           !strncasecmp(contents, needle, len))
 865       return LANG_REBOL;
 866
 867   return LANG_R;
 868 }
 869
 870 const char *disambiguate_rs(SourceFile *sourcefile) {
 871   // .rs is normally Rust, but it might be RenderScript. RenderScript is
 872   // expected to have a "#pragma version(1)" line at the very start, possibly
 873   // after comments. To help with supporting future versions of RenderScript,
 874   // we'll skip the number part.
 875   // As RenderScript is not implemented in ohcount yet, it's returned as NULL.
 876   char *contents = ohcount_sourcefile_get_contents(sourcefile);
 877   if (!contents) {
 878     return LANG_RUST;
 879   }
 880
 881   char *needle = "\n#pragma version";
 882   int len = strlen(needle);
 883   if (strncasecmp(contents, needle + 1, len - 1) == 0) {
 884     // "#pragma version" at the very start of the file is RenderScript.
 885     return NULL;
 886   }
 887
 888   char *eof = contents + ohcount_sourcefile_get_contents_size(sourcefile);
 889
 890   for (; contents < eof - len; ++contents) {
 891     if (!strncmp(contents, needle, len)) {
 892       return NULL;
 893     }
 894   }
 895
 896   return LANG_RUST;
 897 }
 898
 899 const char *disambiguate_st(SourceFile *sourcefile) {
 900   char *p, *pe;
 901   int length;
 902
 903   // Attempt to detect based on file contents.
 904   int found_assignment = 0, found_block_start = 0, found_block_end = 0;
 905
 906   char line[81];
 907   p = ohcount_sourcefile_get_contents(sourcefile);
 908   pe = p;
 909   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 910   while (pe < eof) {
 911     // Get a line at a time.
 912     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 913     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 914     strncpy(line, p, length);
 915     line[length] = '\0';
 916     char *eol = line + strlen(line);
 917     char *line_end = pe;
 918
 919     for (p = line; p < eol; p++) {
 920       if (*p == ':') {
 921         p++;
 922         while (p < eol && (*p == ' ' || *p == '\t')) p++;
 923         if (*p == '=')
 924           found_assignment = 1;
 925         else if (*p == '[')
 926           found_block_start = 1;
 927       } else if (*p == ']' && *(p + 1) == '.') found_block_end = 1;
 928       if (found_assignment && found_block_start && found_block_end)
 929         return LANG_SMALLTALK;
 930     }
 931
 932     // Next line.
 933     pe = line_end;
 934     while (*pe == '\r' || *pe == '\n') pe++;
 935     p = pe;
 936   }
 937
 938   return NULL;
 939 }
 940
 941 int ohcount_is_binary_filename(const char *filename) {
 942   char *p = (char *)filename + strlen(filename);
 943   while (p > filename && *(p - 1) != '.') p--;
 944   if (p > filename) {
 945     struct ExtensionMap *re;
 946     int length = strlen(p);
 947     re = ohcount_hash_language_from_ext(p, length);
 948     if (re) return ISBINARY(re->value);
 949     // Try the lower-case version of this extension.
 950     char lowerext[length];
 951     strncpy(lowerext, p, length);
 952     lowerext[length] = '\0';
 953     for (p = lowerext; p < lowerext + length; p++) *p = tolower(*p);
 954     re = ohcount_hash_language_from_ext(lowerext, length);
 955     if (re) return ISBINARY(re->value);
 956   }
 957   return 0;
 958 }