git.oblomov.eu Git - ohcount/blob - src/detector.c

   1 // detector.c written by Mitchell Foral. mitchell<att>caladbolg.net.
   2 // See COPYING for license information.
   3
   4 #include <ctype.h>
   5 #include <magic.h>
   6 #include <stdio.h>
   7 #include <stdlib.h>
   8 #include <string.h>
   9 #include <unistd.h>
  10
  11 #include "detector.h"
  12 #include "languages.h"
  13 #include "log.h"
  14
  15 #include "hash/cppheader_hash.h"
  16 #include "hash/disambiguatefunc_hash.h"
  17 #include "hash/extension_hash.h"
  18 #include "hash/filename_hash.h"
  19
  20 #define ISBINARY(x) (x[0] == '\1')
  21 #define ISAMBIGUOUS(x) (x[0] == '\2')
  22 #define DISAMBIGUATEWHAT(x) &x[1]
  23
  24 #ifdef _WIN32
  25 # include <fcntl.h>
  26 # define mkstemp(p) _open(_mktemp(p), _O_CREAT | _O_SHORT_LIVED | _O_EXCL)
  27 #endif
  28
  29 /* Parse the output of libmagic and return a language, if any.
  30  * The contents of string `line` will be destroyed.
  31  */
  32 const char *magic_parse(char *line) {
  33   char *p, *pe;
  34   char *eol = line + strlen(line);
  35
  36   char buf[80];
  37   size_t length;
  38
  39   for (p = line; p < eol; p++) *p = tolower(*p);
  40   p = strstr(line, "script text");
  41   if (p && p == line) { // /^script text(?: executable)? for \w/
  42     p = strstr(line, "for ");
  43     if (p) {
  44       p += 4;
  45       pe = p;
  46       while (isalnum(*pe)) pe++;
  47       length = pe - p;
  48       strncpy(buf, p, length);
  49       buf[length] = '\0';
  50       struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
  51       if (rl) return(rl->name);
  52     }
  53   } else if (p) { // /(\w+)(?: -\w+)* script text/
  54     do {
  55       p--;
  56       pe = p;
  57       while (*p == ' ') p--;
  58       while (p != line && isalnum(*(p - 1))) p--;
  59       if (p != line && *(p - 1) == '-') p--;
  60     } while (*p == '-'); // Skip over any switches.
  61     length = pe - p;
  62     strncpy(buf, p, length);
  63     buf[length] = '\0';
  64     struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
  65     if (rl) return(rl->name);
  66   } else if (strstr(line, "xml")) return(LANG_XML);
  67
  68   return NULL;
  69 }
  70
  71 /* Use libmagic to detect file language
  72  */
  73 const char *detect_language_magic(SourceFile *sourcefile) {
  74   char line[80];
  75
  76   magic_t cookie = magic_open(MAGIC_NONE);
  77   if (cookie == NULL) {
  78     fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
  79     exit(1);
  80   }
  81   if (magic_load(cookie, NULL) != 0) {
  82     fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
  83     magic_close(cookie);
  84     exit(1);
  85   }
  86
  87   if (sourcefile->diskpath) {
  88     const char *magic = magic_file(cookie, sourcefile->diskpath);
  89     if (magic == NULL) {
  90       fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
  91       magic_close(cookie);
  92       exit(1);
  93     }
  94     strncpy(line, magic, sizeof(line));
  95     line[sizeof(line)-1] = '\0';
  96   } else {
  97     char *p = ohcount_sourcefile_get_contents(sourcefile);
  98     if (!p) return NULL;
  99
 100     const char *magic = magic_buffer(cookie, p, strlen(p));
 101     if (magic == NULL) {
 102       fprintf(stderr, "libmagic: %s\n", magic_error(cookie));
 103       magic_close(cookie);
 104       exit(1);
 105     }
 106     strncpy(line, magic, sizeof(line));
 107     line[sizeof(line)-1] = '\0';
 108   }
 109
 110   magic_close(cookie);
 111
 112   return magic_parse(line);
 113 }
 114
 115 /* Use all available means to detect file language
 116  */
 117 const char *ohcount_detect_language(SourceFile *sourcefile) {
 118   const char *language = NULL;
 119   char *p, *pe;
 120   int length;
 121
 122   // Attempt to detect using Emacs mode line (/^-\*-\s*mode[\s:]*\w/i).
 123   char line[81] = { '\0' }, buf[81];
 124   p = ohcount_sourcefile_get_contents(sourcefile);
 125   pe = p;
 126   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 127   while (pe < eof) {
 128     // Get the contents of the first line.
 129     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 130     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 131     strncpy(line, p, length);
 132     line[length] = '\0';
 133     if (*line == '#' && *(line + 1) == '!') {
 134       // First line was sh-bang; loop to get contents of second line.
 135       while (*pe == '\r' || *pe == '\n') pe++;
 136       p = pe;
 137     } else break;
 138   }
 139   p = strstr(line, "-*-");
 140   if (p) {
 141     p += 3;
 142     while (*p == ' ' || *p == '\t') p++;
 143     // detect "mode" (any capitalization)
 144     if (strncasecmp(p, "mode", 4) == 0) {
 145       p += 4;
 146       while (*p == ' ' || *p == '\t' || *p == ':') p++;
 147     }
 148     pe = p;
 149     while (!isspace(*pe) && *pe != ';' && pe != strstr(pe, "-*-")) pe++;
 150     length = (pe - p <= sizeof(buf)) ? pe - p : sizeof(buf);
 151     strncpy(buf, p, length);
 152     buf[length] = '\0';
 153
 154                 // Special case for "c" or "C" emacs mode header: always means C, not C++
 155                 if (strcasecmp(buf, "c") == 0) {
 156                                 return LANG_C;
 157                 }
 158
 159     // First try it with the language name.
 160     struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
 161     if (rl) language = rl->name;
 162     if(!language) {
 163       // Then try it with the extension table.
 164       struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
 165       if (re) language = re->value;
 166     }
 167     if (!language) {
 168       // Try the lower-case version of this modeline.
 169       for (pe = buf; pe < buf+length; pe++) *pe = tolower(*pe);
 170       // First try it with the language name.
 171       rl = ohcount_hash_language_from_name(buf, length);
 172       if (rl) language = rl->name;
 173     }
 174     if (!language) {
 175       // Then try it with the extension table.
 176       struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
 177       if (re) language = re->value;
 178     }
 179   }
 180
 181   // Attempt to detect based on file extension.
 182   if(!language) {
 183       length = strlen(sourcefile->ext);
 184       struct ExtensionMap *re = ohcount_hash_language_from_ext(sourcefile->ext,
 185                                                                length);
 186       if (re) language = re->value;
 187     if (!language) {
 188       // Try the lower-case version of this extension.
 189       char lowerext[length + 1];
 190       strncpy(lowerext, sourcefile->ext, length);
 191       lowerext[length] = '\0';
 192       for (p = lowerext; p < lowerext + length; p++) *p = tolower(*p);
 193       struct ExtensionMap *re = ohcount_hash_language_from_ext(lowerext, length);
 194       if (re) language = re->value;
 195     }
 196   }
 197
 198   // Attempt to detect based on filename.
 199   if(!language) {
 200     length = strlen(sourcefile->filename);
 201     struct FilenameMap *rf =
 202       ohcount_hash_language_from_filename(sourcefile->filename, length);
 203     if (rf) language = rf->value;
 204   }
 205
 206   // Attempt to detect based on Unix 'file' command.
 207   if(!language) {
 208     language = detect_language_magic(sourcefile);
 209   }
 210
 211   if (language) {
 212     if (ISAMBIGUOUS(language)) {
 213       // Call the appropriate function for disambiguation.
 214       length = strlen(DISAMBIGUATEWHAT(language));
 215       struct DisambiguateFuncsMap *rd =
 216         ohcount_hash_disambiguate_func_from_id(DISAMBIGUATEWHAT(language),
 217                                                length);
 218       if (rd) language = rd->value(sourcefile);
 219     } else language = ISBINARY(language) ? NULL : language;
 220   }
 221   return language;
 222 }
 223
 224 const char *disambiguate_aspx(SourceFile *sourcefile) {
 225   char *p = ohcount_sourcefile_get_contents(sourcefile);
 226   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 227   for (; p < eof; p++) {
 228     // /<%@\s*Page[^>]+Language="VB"[^>]+%>/
 229     p = strstr(p, "<%@");
 230     if (!p)
 231                         break;
 232     char *pe = strstr(p, "%>");
 233     if (p && pe) {
 234       p += 3;
 235       const int length = pe - p;
 236       char buf[length];
 237       strncpy(buf, p, length);
 238       buf[length] = '\0';
 239       char *eol = buf + strlen(buf);
 240       for (p = buf; p < eol; p++) *p = tolower(*p);
 241       p = buf;
 242       while (*p == ' ' || *p == '\t') p++;
 243       if (strncmp(p, "page", 4) == 0) {
 244         p += 4;
 245         if (strstr(p, "language=\"vb\""))
 246           return LANG_VB_ASPX;
 247       }
 248     }
 249   }
 250   return LANG_CS_ASPX;
 251 }
 252
 253 // 6502 assembly or XML-based Advanced Stream Redirector ?
 254 const char *disambiguate_asx(SourceFile *sourcefile) {
 255   char *p = ohcount_sourcefile_get_contents(sourcefile);
 256   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 257   for (; p < eof; p++) {
 258     switch (*p) {
 259     case ' ':
 260     case '\t':
 261     case '\n':
 262     case '\r':
 263       break;
 264     case '<':
 265     case '\0':
 266     // byte-order marks:
 267     case (char) 0xef:
 268     case (char) 0xfe:
 269     case (char) 0xff:
 270       return NULL; // XML
 271     default:
 272       return LANG_ASSEMBLER;
 273     }
 274   }
 275   return LANG_ASSEMBLER; // only blanks - not valid XML, may be valid asm
 276 }
 277
 278 const char *disambiguate_b(SourceFile *sourcefile) {
 279   char *p = ohcount_sourcefile_get_contents(sourcefile);
 280   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 281   while (p < eof) {
 282     // /(implement[ \t])|(include[ \t]+"[^"]*";)|
 283     //  ((return|break|continue).*;|(pick|case).*\{)/
 284     if (strncmp(p, "implement", 9) == 0 &&
 285         (*(p + 9) == ' ' || *(p + 9) == '\t'))
 286       return LANG_LIMBO;
 287     else if (strncmp(p, "include", 7) == 0 &&
 288         (*(p + 7) == ' ' || *(p + 7) == '\t')) {
 289       p += 7;
 290       while (*p == ' ' || *p == '\t') p++;
 291       if (*p == '"') {
 292         while (*p != '"' && p < eof) p++;
 293         if (*p == '"' && *(p + 1) == ';')
 294           return LANG_LIMBO;
 295       }
 296     } else if (strncmp(p, "return", 6) == 0 ||
 297                strncmp(p, "break", 5) == 0 ||
 298                strncmp(p, "continue", 8) == 0) {
 299       if (strstr(p, ";"))
 300         return LANG_LIMBO;
 301     } else if (strncmp(p, "pick", 4) == 0 ||
 302                strncmp(p, "case", 4) == 0) {
 303       if (strstr(p, "{"))
 304         return LANG_LIMBO;
 305     }
 306     p++;
 307   }
 308   return disambiguate_basic(sourcefile);
 309 }
 310
 311 const char *disambiguate_basic(SourceFile *sourcefile) {
 312   char *p, *pe;
 313   int length;
 314
 315   // Attempt to detect based on file contents.
 316   char line[81];
 317   p = ohcount_sourcefile_get_contents(sourcefile);
 318   pe = p;
 319   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 320   while (pe < eof) {
 321     // Get a line at a time.
 322     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 323     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 324     strncpy(line, p, length);
 325     line[length] = '\0';
 326     char *line_end = pe;
 327
 328     p = line;
 329     if (isdigit(*p)) {
 330       // /^\d+\s+\w/
 331       p++;
 332       while (isdigit(*p)) p++;
 333       if (*p == ' ' || *p == '\t') {
 334         p++;
 335         while (*p == ' ' || *p == '\t') p++;
 336         if (isalnum(*p))
 337           return LANG_CLASSIC_BASIC;
 338       }
 339     }
 340
 341     // Next line.
 342     pe = line_end;
 343     while (*pe == '\r' || *pe == '\n') pe++;
 344     p = pe;
 345   }
 346
 347   // Attempt to detect from associated VB files in file context.
 348   char **filenames = sourcefile->filenames;
 349   if (filenames) {
 350     int i;
 351     for (i = 0; filenames[i] != NULL; i++) {
 352       pe = filenames[i] + strlen(filenames[i]);
 353       p = pe;
 354       while (p > filenames[i] && *(p - 1) != '.') p--;
 355       length = pe - p;
 356       if (length == 3 &&
 357           (strncmp(p, "frm", length) == 0 ||
 358            strncmp(p, "frx", length) == 0 ||
 359            strncmp(p, "vba", length) == 0 ||
 360            strncmp(p, "vbp", length) == 0 ||
 361            strncmp(p, "vbs", length) == 0)) {
 362         return LANG_VISUALBASIC;
 363       }
 364     }
 365   }
 366
 367   return LANG_STRUCTURED_BASIC;
 368 }
 369
 370 const char *disambiguate_cs(SourceFile *sourcefile) {
 371   // Attempt to detect based on file contents.
 372         char *contents = ohcount_sourcefile_get_contents(sourcefile);
 373   if (contents && strstr(contents, "<?cs"))
 374     return LANG_CLEARSILVER_TEMPLATE;
 375   else
 376     return LANG_CSHARP;
 377 }
 378
 379 const char *disambiguate_def(SourceFile *sourcefile) {
 380   char *p = ohcount_sourcefile_get_contents(sourcefile);
 381   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 382   for (; p < eof; p++) {
 383     switch (*p) {
 384     case ' ':
 385     case '\t':
 386     case '\n':
 387     case '\r':
 388       break;
 389     case '(':
 390       if (p[1] == '*') // Modula-2 comment
 391         return LANG_MODULA2;
 392       return NULL;
 393     case 'D':
 394       if (strncmp(p, "DEFINITION", 10) == 0) // Modula-2 "DEFINITION MODULE"
 395         return LANG_MODULA2;
 396       return NULL;
 397     default:
 398       return NULL; // not Modula-2
 399     }
 400   }
 401   return NULL; // only blanks
 402 }
 403
 404 const char *disambiguate_fortran(SourceFile *sourcefile) {
 405   char *p;
 406
 407   p = ohcount_sourcefile_get_contents(sourcefile);
 408   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 409
 410   // Try the assumption of a fixed formatted source code, and return free
 411   // format if anything opposes this assumption.
 412   // Rules based on the Fortran standard, page 47:
 413   // ftp://ftp.nag.co.uk/sc22wg5/N1801-N1850/N1830.pdf
 414   while (p < eof) {
 415     int i = 1;
 416     int blanklabel;
 417     // Process a single line; tabulators are not valid in Fortran code
 418     // but some compilers accept them to skip the first 5 columns.
 419     if (*p == ' ' || *p == '\t' || isdigit(*p)) {
 420       // Only consider lines starting with a blank or digit
 421       // (non-comment in fixed)
 422       if (*p == '\t') i = 5;
 423       blanklabel = (*p == ' ' || *p == '\t');
 424       while (*p != '\r' && *p != '\n' && p < eof) {
 425         p++; i++;
 426         if (i <= 5) {
 427           blanklabel = blanklabel && (*p == ' ');
 428           if ( !isdigit(*p) && *p != ' ' && *p != '!')
 429             // Non-digit, non-blank, non-comment character in the label field
 430             // definetly not valid fixed formatted code!
 431             return LANG_FORTRANFREE;
 432         }
 433         if ((i == 6) && !blanklabel && *p != ' ' && *p != '0')
 434           // Fixed format continuation line with non-blank label field
 435           // not allowed, assume free format:
 436           return LANG_FORTRANFREE;
 437         // Ignore comments (a ! character in column 6 is a continuation in
 438         // fixed form)
 439         if (*p == '!' && i != 6) {
 440           while (*p != '\r' && *p != '\n' && p < eof) p++;
 441         } else {
 442           // Ignore quotes
 443           if (*p == '"') {
 444             if (p < eof) {p++; i++;}
 445             while (*p != '"' && *p != '\r' && *p != '\n' && p < eof) {
 446               p++; i++;
 447             }
 448           }
 449           if (*p == '\'') {
 450             if (p < eof) {p++; i++;}
 451             while (*p != '\'' && *p != '\r' && *p != '\n' && p < eof) {
 452               p++; i++;
 453             }
 454           }
 455           // Check for free format line continuation
 456           if (i > 6 && i <= 72 && *p == '&')
 457             // Found an unquoted free format continuation character in the fixed
 458             // format code section. This has to be free format.
 459             return LANG_FORTRANFREE;
 460         }
 461       }
 462     } else {
 463       // Not a statement line in fixed format...
 464       if (*p != 'C' && *p != 'c' && *p != '*' && *p != '!')
 465         // Not a valid fixed form comment, has to be free formatted source
 466         return LANG_FORTRANFREE;
 467       // Comment in fixed form, ignore this line
 468       while (*p != '\r' && *p != '\n' && p < eof) p++;
 469     }
 470     // Skip all line ends
 471     while ((*p == '\r' || *p == '\n') && p < eof) p++;
 472   }
 473   // Assume fixed format if none of the lines broke the assumptions
 474   return LANG_FORTRANFIXED;
 475 }
 476
 477 const char *disambiguate_h(SourceFile *sourcefile) {
 478   char *p, *pe, *bof;
 479   int length;
 480
 481   // If the directory contains a matching *.m file, likely Objective C.
 482   length = strlen(sourcefile->filename);
 483   if (strcmp(sourcefile->ext, "h") == 0) {
 484     char path[length];
 485     strncpy(path, sourcefile->filename, length);
 486     path[length] = '\0';
 487     *(path + length - 1) = 'm';
 488     char **filenames = sourcefile->filenames;
 489     if (filenames) {
 490       int i;
 491       for (i = 0; filenames[i] != NULL; i++)
 492         if (strcmp(path, filenames[i]) == 0)
 493           return LANG_OBJECTIVE_C;
 494     }
 495   }
 496
 497   // Attempt to detect based on file contents.
 498   char line[81], buf[81];
 499   bof = ohcount_sourcefile_get_contents(sourcefile);
 500   p = bof;
 501   pe = p;
 502   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 503   while (pe < eof) {
 504     // Get a line at a time.
 505     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 506     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 507     strncpy(line, p, length);
 508     line[length] = '\0';
 509     char *eol = line + strlen(line);
 510     char *line_end = pe;
 511
 512     // Look for C++ headers.
 513     if (*line == '#') {
 514       p = line + 1;
 515       while (*p == ' ' || *p == '\t') p++;
 516       if (strncmp(p, "include", 7) == 0 &&
 517           (*(p + 7) == ' ' || *(p + 7) == '\t')) {
 518         // /^#\s*include\s+[<"][^>"]+[>"]/
 519         p += 8;
 520         while (*p == ' ' || *p == '\t') p++;
 521         if (*p == '<' || *p == '"') {
 522           // Is the header file a C++ header file?
 523           p++;
 524           pe = p;
 525           while (pe < eol && *pe != '>' && *pe != '"') pe++;
 526           length = pe - p;
 527           strncpy(buf, p, length);
 528           buf[length] = '\0';
 529           if (ohcount_hash_is_cppheader(buf, length))
 530             return LANG_CPP;
 531           // Is the extension for the header file a C++ file?
 532           p = pe;
 533           while (p > line && *(p - 1) != '.') p--;
 534           length = pe - p;
 535           strncpy(buf, p, length);
 536           buf[length] = '\0';
 537           struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
 538           if (re && strcmp(re->value, LANG_CPP) == 0)
 539             return LANG_CPP;
 540         }
 541       }
 542     }
 543
 544     // Look for C++ keywords.
 545     p = line;
 546     while (p < eol) {
 547       if (islower(*p) && p != bof && !isalnum(*(p - 1)) && *(p - 1) != '_') {
 548         pe = p;
 549         while (islower(*pe)) pe++;
 550         if (!isalnum(*pe) && *pe != '_') {
 551           length = pe - p;
 552           strncpy(buf, p, length);
 553           buf[length] = '\0';
 554           if (strcmp(buf, "class") == 0 ||
 555               strcmp(buf, "namespace") == 0 ||
 556               strcmp(buf, "template") == 0 ||
 557               strcmp(buf, "typename") == 0)
 558             return LANG_CPP;
 559         }
 560         p = pe + 1;
 561       } else p++;
 562     }
 563
 564     // Next line.
 565     pe = line_end;
 566     while (*pe == '\r' || *pe == '\n') pe++;
 567     p = pe;
 568   }
 569
 570   // Nothing to suggest C++.
 571   return LANG_C;
 572 }
 573
 574 const char *disambiguate_in(SourceFile *sourcefile) {
 575   char *p, *pe;
 576   int length;
 577   const char *language = NULL;
 578
 579   p = sourcefile->filepath;
 580   pe = p + strlen(p) - 3;
 581   if (strstr(p, ".") <= pe) {
 582     // Only if the filename has an extension prior to the .in
 583     length = pe - p;
 584     char buf[length];
 585     strncpy(buf, p, length);
 586     buf[length] = '\0';
 587     p = ohcount_sourcefile_get_contents(sourcefile);
 588                 if (!p) {
 589                         return NULL;
 590                 }
 591
 592     // A SourceFile's filepath and diskpath need not be the same.
 593     // Here, we'll take advantage of this to set up a new SourceFile
 594     // whose filepath does not have the *.in extension, but whose
 595     // diskpath still points back to the original file on disk (if any).
 596     SourceFile *undecorated = ohcount_sourcefile_new(buf);
 597     if (sourcefile->diskpath) {
 598       ohcount_sourcefile_set_diskpath(undecorated, sourcefile->diskpath);
 599     }
 600     ohcount_sourcefile_set_contents(undecorated, p);
 601                 undecorated->filenames = sourcefile->filenames;
 602     language = ohcount_sourcefile_get_language(undecorated);
 603     ohcount_sourcefile_free(undecorated);
 604   }
 605   return language;
 606 }
 607
 608 const char *disambiguate_inc(SourceFile *sourcefile) {
 609   char *p = ohcount_sourcefile_get_contents(sourcefile);
 610         if (p) {
 611                 char *eof = p + strlen(p);
 612                 while (p < eof) {
 613                         if (*p == '\0')
 614                                 return BINARY;
 615                         else if (*p == '?' && strncmp(p + 1, "php", 3) == 0)
 616                                 return LANG_PHP;
 617                         p++;
 618                 }
 619         }
 620   return NULL;
 621 }
 622
 623 const char *disambiguate_m(SourceFile *sourcefile) {
 624   char *p, *pe;
 625   int length;
 626
 627   // Attempt to detect based on a weighted heuristic of file contents.
 628   int matlab_score = 0;
 629   int objective_c_score = 0;
 630   int limbo_score = 0;
 631   int octave_syntax_detected = 0;
 632
 633   int i, has_h_headers = 0, has_c_files = 0;
 634   char **filenames = sourcefile->filenames;
 635   if (filenames) {
 636     for (i = 0; filenames[i] != NULL; i++) {
 637       p = filenames[i];
 638       pe = p + strlen(p);
 639       if (pe - p >= 4) {
 640         if (*(pe - 4) == '.' && *(pe - 3) == 'c' &&
 641             ((*(pe - 2) == 'p' && *(pe - 1) == 'p') ||
 642              (*(pe - 2) == '+' && *(pe - 1) == '+') ||
 643              (*(pe - 2) == 'x' && *(pe - 1) == 'x'))) {
 644           has_c_files = 1;
 645           break; // short circuit
 646         }
 647       } else if (pe - p >= 3) {
 648         if (*(pe - 3) == '.' && *(pe - 2) == 'c' && *(pe - 1) == 'c') {
 649           has_c_files = 1;
 650           break; // short circuit
 651         }
 652       } else if (pe - p >= 2) {
 653         if (*(pe - 2) == '.') {
 654           if (*(pe - 1) == 'h')
 655             has_h_headers = 1;
 656           else if (*(pe - 1) == 'c' || *(pe - 1) == 'C') {
 657             has_c_files = 1;
 658             break; // short circuit
 659           }
 660         }
 661       }
 662     }
 663   }
 664   if (has_h_headers && !has_c_files)
 665     objective_c_score += 5;
 666
 667   char line[81], buf[81];
 668   p = ohcount_sourcefile_get_contents(sourcefile);
 669   pe = p;
 670   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 671   while (pe < eof) {
 672     // Get a line at a time.
 673     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 674     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 675     strncpy(line, p, length);
 676     line[length] = '\0';
 677     char *eol = line + strlen(line);
 678     char *line_end = pe;
 679
 680     // Look for tell-tale lines.
 681     p = line;
 682     while (*p == ' ' || *p == '\t') p++;
 683     if (*p == '%') { // Matlab comment
 684       matlab_score++;
 685                 } else if (*p == '#' && strncmp(p, "#import", 7) == 0) { // Objective C
 686                         objective_c_score++;
 687     } else if (*p == '#') { // Limbo or Octave comment
 688       while (*p == '#') p++;
 689       if (*p == ' ' || *p == '\t') {
 690         limbo_score++;
 691         matlab_score++;
 692         octave_syntax_detected = 1;
 693       }
 694     } else if (*p == '/' && *(p + 1) == '/' || *(p + 1) == '*') {
 695       objective_c_score++; // Objective C comment
 696     } else if (*p == '+' || *p == '-') { // Objective C method signature
 697       objective_c_score++;
 698     } else if (*p == '@' || *p == '#') { // Objective C method signature
 699       if (strncmp(p, "@implementation", 15) == 0 ||
 700           strncmp(p, "@interface", 10) == 0)
 701         objective_c_score++;
 702     } else if (strncmp(p, "function", 8) == 0) { // Matlab or Octave function
 703       p += 8;
 704       while (*p == ' ' || *p == '\t') p++;
 705       if (*p == '(')
 706         matlab_score++;
 707     } else if (strncmp(p, "include", 7) == 0) { // Limbo include
 708       // /^include[ \t]+"[^"]+\.m";/
 709       p += 7;
 710       if (*p == ' ' || *p == '\t') {
 711         while (*p == ' ' || *p == '\t') p++;
 712         if (*p == '"') {
 713           while (*p != '"' && p < eol) p++;
 714           if (*p == '"' && *(p - 2) == '.' && *(p - 1) == 'm')
 715             limbo_score++;
 716         }
 717       }
 718     }
 719
 720     // Look for Octave keywords.
 721     p = line;
 722     while (p < eol) {
 723       if (islower(*p) && p != line && !isalnum(*(p - 1))) {
 724         pe = p;
 725         while (islower(*pe) || *pe == '_') pe++;
 726         if (!isalnum(*pe)) {
 727           length = pe - p;
 728           strncpy(buf, p, length);
 729           buf[length] = '\0';
 730           if (strcmp(buf, "end_try_catch") == 0 ||
 731               strcmp(buf, "end_unwind_protect") == 0 ||
 732               strcmp(buf, "endfunction") == 0 ||
 733               strcmp(buf, "endwhile") == 0)
 734             octave_syntax_detected = 1;
 735         }
 736         p = pe + 1;
 737       } else p++;
 738     }
 739
 740     // Look for Limbo declarations
 741     p = line;
 742     while (p < eol) {
 743       if (*p == ':' && (*(p + 1) == ' ' || *(p + 1) == '\t')) {
 744         // /:[ \t]+(module|adt|fn ?\(|con[ \t])/
 745         p += 2;
 746         if (strncmp(p, "module", 6) == 0 && !isalnum(*(p + 6)) ||
 747             strncmp(p, "adt", 3) == 0 && !isalnum(*(p + 3)) ||
 748             strncmp(p, "fn", 2) == 0 &&
 749               (*(p + 2) == ' ' && *(p + 3) == '(' || *(p + 2) == '(') ||
 750             strncmp(p, "con", 3) == 0 &&
 751               (*(p + 3) == ' ' || *(p + 3) == '\t'))
 752           limbo_score++;
 753       } else p++;
 754     }
 755
 756     // Next line.
 757     pe = line_end;
 758     while (*pe == '\r' || *pe == '\n') pe++;
 759     p = pe;
 760   }
 761
 762   if (limbo_score > objective_c_score && limbo_score > matlab_score)
 763     return LANG_LIMBO;
 764   else if (objective_c_score > matlab_score)
 765     return LANG_OBJECTIVE_C;
 766   else
 767     return octave_syntax_detected ? LANG_OCTAVE : LANG_MATLAB;
 768 }
 769
 770 #include <pcre.h>
 771
 772 // strnlen is not available on OS X, so we roll our own
 773 size_t mystrnlen(const char *begin, size_t maxlen) {
 774   const char *end = memchr(begin, '\0', maxlen);
 775   return end ? (end - begin) : maxlen;
 776 }
 777
 778 const char *disambiguate_pp(SourceFile *sourcefile) {
 779         char *p = ohcount_sourcefile_get_contents(sourcefile);
 780   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 781
 782         /* prepare regular expressions */
 783         pcre *re;
 784         const char *error;
 785         int erroffset;
 786         re = pcre_compile("(define\\s+\\w+\\s*\\(|class \\s+\\w+\\s*{)", 0, &error, &erroffset, NULL);
 787
 788         for (; p < eof; p++) {
 789                 if (strncmp(p, "$include", 8) == 0 ||
 790                                 strncmp(p, "$INCLUDE", 8) == 0 ||
 791                                 strncmp(p, "end.", 4) == 0)
 792                         return LANG_PASCAL;
 793                 if (strncmp(p, "enable =>", 9) == 0 ||
 794                                 strncmp(p, "ensure =>", 9) == 0 ||
 795                                 strncmp(p, "content =>", 10) == 0 ||
 796                                 strncmp(p, "source =>", 9) == 0 ||
 797                                 strncmp(p, "include ", 8) == 0)
 798                         return LANG_PUPPET;
 799
 800                 /* regexp for checking for define and class declarations */
 801
 802                 int rc;
 803                 int ovector[30];
 804                 rc = pcre_exec(re, NULL, p, mystrnlen(p, 100), 0, 0, ovector, 30);
 805                 if(rc > 0) {
 806                         return LANG_PUPPET;
 807                 }
 808
 809         }
 810         return LANG_PASCAL;
 811 }
 812
 813 const char *disambiguate_pl(SourceFile *sourcefile) {
 814         char *contents = ohcount_sourcefile_get_contents(sourcefile);
 815   if (!contents)
 816     return NULL;
 817
 818   // Check for a perl shebang on first line of file
 819         const char *error;
 820         int erroffset;
 821         pcre *re = pcre_compile("#![^\\n]*perl", PCRE_CASELESS, &error, &erroffset, NULL);
 822   if (pcre_exec(re, NULL, contents, mystrnlen(contents, 100), 0, PCRE_ANCHORED, NULL, 0) > -1)
 823     return LANG_PERL;
 824
 825   // Check for prolog :- rules
 826   if (strstr(contents, ":- ") || strstr(contents, ":-\n"))
 827     return LANG_PROLOG;
 828
 829   // Perl by default.
 830   return LANG_PERL;
 831 }
 832
 833 #define QMAKE_SOURCES_SPACE "SOURCES +="
 834 #define QMAKE_SOURCES "SOURCES+="
 835 #define QMAKE_CONFIG_SPACE "CONFIG +="
 836 #define QMAKE_CONFIG "CONFIG+="
 837
 838 const char *disambiguate_pro(SourceFile *sourcefile) {
 839         char *p = ohcount_sourcefile_get_contents(sourcefile);
 840   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 841         for (; p < eof; p++) {
 842                 if (strncmp(p, QMAKE_SOURCES_SPACE, strlen(QMAKE_SOURCES_SPACE)) == 0 ||
 843                                 strncmp(p, QMAKE_SOURCES, strlen(QMAKE_SOURCES)) == 0 ||
 844                                 strncmp(p, QMAKE_CONFIG_SPACE, strlen(QMAKE_CONFIG_SPACE)) == 0 ||
 845                                 strncmp(p, QMAKE_CONFIG, strlen(QMAKE_CONFIG)) == 0)
 846                         return LANG_MAKE; // really QMAKE
 847         }
 848         return LANG_IDL_PVWAVE;
 849 }
 850
 851 const char *disambiguate_r(SourceFile *sourcefile) {
 852   char *contents = ohcount_sourcefile_get_contents(sourcefile);
 853   if (!contents)
 854     return LANG_R;
 855
 856   char *eof = contents + ohcount_sourcefile_get_contents_size(sourcefile);
 857
 858   // Detect REBOL by looking for the occurence of "rebol" in the contents
 859   // (case-insensitive). Correct REBOL scripts have a "REBOL [...]" header
 860   // block.
 861   char *needle = "rebol";
 862   int len = strlen(needle);
 863   for (; contents < eof - len; ++contents)
 864     if (tolower(*contents) == *needle &&
 865           !strncasecmp(contents, needle, len))
 866       return LANG_REBOL;
 867
 868   return LANG_R;
 869 }
 870
 871 const char *disambiguate_st(SourceFile *sourcefile) {
 872   char *p, *pe;
 873   int length;
 874
 875   // Attempt to detect based on file contents.
 876   int found_assignment = 0, found_block_start = 0, found_block_end = 0;
 877
 878   char line[81];
 879   p = ohcount_sourcefile_get_contents(sourcefile);
 880   pe = p;
 881   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 882   while (pe < eof) {
 883     // Get a line at a time.
 884     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 885     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 886     strncpy(line, p, length);
 887     line[length] = '\0';
 888     char *eol = line + strlen(line);
 889     char *line_end = pe;
 890
 891     for (p = line; p < eol; p++) {
 892       if (*p == ':') {
 893         p++;
 894         while (p < eol && (*p == ' ' || *p == '\t')) p++;
 895         if (*p == '=')
 896           found_assignment = 1;
 897         else if (*p == '[')
 898           found_block_start = 1;
 899       } else if (*p == ']' && *(p + 1) == '.') found_block_end = 1;
 900       if (found_assignment && found_block_start && found_block_end)
 901         return LANG_SMALLTALK;
 902     }
 903
 904     // Next line.
 905     pe = line_end;
 906     while (*pe == '\r' || *pe == '\n') pe++;
 907     p = pe;
 908   }
 909
 910   return NULL;
 911 }
 912
 913 int ohcount_is_binary_filename(const char *filename) {
 914   char *p = (char *)filename + strlen(filename);
 915   while (p > filename && *(p - 1) != '.') p--;
 916   if (p > filename) {
 917     struct ExtensionMap *re;
 918     int length = strlen(p);
 919     re = ohcount_hash_language_from_ext(p, length);
 920     if (re) return ISBINARY(re->value);
 921     // Try the lower-case version of this extension.
 922     char lowerext[length];
 923     strncpy(lowerext, p, length);
 924     lowerext[length] = '\0';
 925     for (p = lowerext; p < lowerext + length; p++) *p = tolower(*p);
 926     re = ohcount_hash_language_from_ext(lowerext, length);
 927     if (re) return ISBINARY(re->value);
 928   }
 929   return 0;
 930 }