git.oblomov.eu Git - ohcount/blob - src/detector.c

   1 // detector.c written by Mitchell Foral. mitchell<att>caladbolg.net.
   2 // See COPYING for license information.
   3
   4 #include <ctype.h>
   5 #include <stdio.h>
   6 #include <stdlib.h>
   7 #include <string.h>
   8 #include <unistd.h>
   9
  10 #include "detector.h"
  11 #include "languages.h"
  12 #include "log.h"
  13
  14 #include "hash/cppheader_hash.h"
  15 #include "hash/disambiguatefunc_hash.h"
  16 #include "hash/extension_hash.h"
  17 #include "hash/filename_hash.h"
  18
  19 #define ISBINARY(x) (x[0] == '\1')
  20 #define ISAMBIGUOUS(x) (x[0] == '\2')
  21 #define DISAMBIGUATEWHAT(x) &x[1]
  22
  23 #ifdef _WIN32
  24 # include <fcntl.h>
  25 # define mkstemp(p) _open(_mktemp(p), _O_CREAT | _O_SHORT_LIVED | _O_EXCL)
  26 #endif
  27
  28 /* Replaces single quotes (') with an escape sequence ('\'')
  29  * suitable for use on the command line.
  30  */
  31 void escape_path(char *safe, const char *unsafe) {
  32   do {
  33     switch (*unsafe) {
  34     case  '\'':
  35       *safe++ = '\'';
  36       *safe++ = '\\';
  37       *safe++ = '\'';
  38       *safe++ = '\'';
  39       break;
  40     default:
  41       *safe++ = *unsafe;
  42       break;
  43     }
  44   } while (*unsafe++);
  45 }
  46
  47 const char *ohcount_detect_language(SourceFile *sourcefile) {
  48   const char *language = NULL;
  49   char *p, *pe;
  50   int length;
  51
  52   // Attempt to detect using Emacs mode line (/^-\*-\s*mode[\s:]*\w/i).
  53   char line[81] = { '\0' }, buf[81];
  54   p = ohcount_sourcefile_get_contents(sourcefile);
  55   pe = p;
  56   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
  57   while (pe < eof) {
  58     // Get the contents of the first line.
  59     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
  60     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
  61     strncpy(line, p, length);
  62     line[length] = '\0';
  63     if (*line == '#' && *(line + 1) == '!') {
  64       // First line was sh-bang; loop to get contents of second line.
  65       while (*pe == '\r' || *pe == '\n') pe++;
  66       p = pe;
  67     } else break;
  68   }
  69   p = strstr(line, "-*-");
  70   if (p) {
  71     p += 3;
  72     while (*p == ' ' || *p == '\t') p++;
  73     // detect "mode" (any capitalization)
  74     if (strncasecmp(p, "mode", 4) == 0) {
  75       p += 4;
  76       while (*p == ' ' || *p == '\t' || *p == ':') p++;
  77     }
  78     pe = p;
  79     while (!isspace(*pe) && *pe != ';' && pe != strstr(pe, "-*-")) pe++;
  80     length = (pe - p <= sizeof(buf)) ? pe - p : sizeof(buf);
  81     strncpy(buf, p, length);
  82     buf[length] = '\0';
  83
  84                 // Special case for "c" or "C" emacs mode header: always means C, not C++
  85                 if (strcasecmp(buf, "c") == 0) {
  86                                 return LANG_C;
  87                 }
  88
  89     // First try it with the language name.
  90     struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
  91     if (rl) language = rl->name;
  92     if(!language) {
  93       // Then try it with the extension table.
  94       struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
  95       if (re) language = re->value;
  96     }
  97     if (!language) {
  98       // Try the lower-case version of this modeline.
  99       for (pe = buf; pe < buf+length; pe++) *pe = tolower(*pe);
 100       // First try it with the language name.
 101       rl = ohcount_hash_language_from_name(buf, length);
 102       if (rl) language = rl->name;
 103     }
 104     if (!language) {
 105       // Then try it with the extension table.
 106       struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
 107       if (re) language = re->value;
 108     }
 109   }
 110
 111   // Attempt to detect based on file extension.
 112   if(!language) {
 113       length = strlen(sourcefile->ext);
 114       struct ExtensionMap *re = ohcount_hash_language_from_ext(sourcefile->ext,
 115                                                                length);
 116       if (re) language = re->value;
 117     if (!language) {
 118       // Try the lower-case version of this extension.
 119       char lowerext[length + 1];
 120       strncpy(lowerext, sourcefile->ext, length);
 121       lowerext[length] = '\0';
 122       for (p = lowerext; p < lowerext + length; p++) *p = tolower(*p);
 123       struct ExtensionMap *re = ohcount_hash_language_from_ext(lowerext, length);
 124       if (re) language = re->value;
 125     }
 126   }
 127
 128   // Attempt to detect based on filename.
 129   if(!language) {
 130     length = strlen(sourcefile->filename);
 131     struct FilenameMap *rf =
 132       ohcount_hash_language_from_filename(sourcefile->filename, length);
 133     if (rf) language = rf->value;
 134   }
 135
 136   // Attempt to detect based on Unix 'file' command.
 137   if(!language) {
 138     int tmpfile = 0;
 139     char *path = sourcefile->filepath;
 140     if (sourcefile->diskpath)
 141       path = sourcefile->diskpath;
 142     if (access(path, F_OK) != 0) { // create temporary file
 143       path = malloc(21);
 144       strncpy(path, "/tmp/ohcount_XXXXXXX\0", 21);
 145       int fd = mkstemp(path);
 146       char *contents = ohcount_sourcefile_get_contents(sourcefile);
 147       log_it("contents:");
 148       log_it(contents);
 149       length = contents ? strlen(contents) : 0;
 150       if (write(fd, contents, length) != length) {
 151         fprintf(stderr, "src/detector.c: Could not write temporary file %s.\n", path);
 152         exit(1);
 153       }
 154       close(fd);
 155       tmpfile = 1;
 156     }
 157
 158     /* Filenames may include single quotes, which must be escaped */
 159     char escaped_path[strlen(path) * 4 + 1];
 160     escape_path(escaped_path, path);
 161
 162     char command[strlen(escaped_path) + 11];
 163     sprintf(command, "file -b '%s'", escaped_path);
 164     FILE *f = popen(command, "r");
 165     if (f) {
 166       if (fgets(line, sizeof(line), f) == NULL) {
 167         fprintf(stderr, "src/detector.c: fgets() failed\n");
 168         exit(1);
 169       }
 170       char *eol = line + strlen(line);
 171       for (p = line; p < eol; p++) *p = tolower(*p);
 172       p = strstr(line, "script text");
 173       if (p && p == line) { // /^script text(?: executable)? for \w/
 174         p = strstr(line, "for ");
 175         if (p) {
 176           p += 4;
 177           pe = p;
 178           while (isalnum(*pe)) pe++;
 179           length = pe - p;
 180           strncpy(buf, p, length);
 181           buf[length] = '\0';
 182           struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
 183           if (rl) language = rl->name;
 184         }
 185       } else if (p) { // /(\w+)(?: -\w+)* script text/
 186         do {
 187           p--;
 188           pe = p;
 189           while (*p == ' ') p--;
 190           while (p != line && isalnum(*(p - 1))) p--;
 191           if (p != line && *(p - 1) == '-') p--;
 192         } while (*p == '-'); // Skip over any switches.
 193         length = pe - p;
 194         strncpy(buf, p, length);
 195         buf[length] = '\0';
 196         struct LanguageMap *rl = ohcount_hash_language_from_name(buf, length);
 197         if (rl) language = rl->name;
 198       } else if (strstr(line, "xml")) language = LANG_XML;
 199       pclose(f);
 200       if (tmpfile) {
 201         remove(path);
 202         free(path);
 203       }
 204     }
 205   }
 206   if (language) {
 207     if (ISAMBIGUOUS(language)) {
 208       // Call the appropriate function for disambiguation.
 209       length = strlen(DISAMBIGUATEWHAT(language));
 210       struct DisambiguateFuncsMap *rd =
 211         ohcount_hash_disambiguate_func_from_id(DISAMBIGUATEWHAT(language),
 212                                                length);
 213       if (rd) language = rd->value(sourcefile);
 214     } else language = ISBINARY(language) ? NULL : language;
 215   }
 216   return language;
 217 }
 218
 219 const char *disambiguate_aspx(SourceFile *sourcefile) {
 220   char *p = ohcount_sourcefile_get_contents(sourcefile);
 221   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 222   for (; p < eof; p++) {
 223     // /<%@\s*Page[^>]+Language="VB"[^>]+%>/
 224     p = strstr(p, "<%@");
 225     if (!p)
 226                         break;
 227     char *pe = strstr(p, "%>");
 228     if (p && pe) {
 229       p += 3;
 230       const int length = pe - p;
 231       char buf[length];
 232       strncpy(buf, p, length);
 233       buf[length] = '\0';
 234       char *eol = buf + strlen(buf);
 235       for (p = buf; p < eol; p++) *p = tolower(*p);
 236       p = buf;
 237       while (*p == ' ' || *p == '\t') p++;
 238       if (strncmp(p, "page", 4) == 0) {
 239         p += 4;
 240         if (strstr(p, "language=\"vb\""))
 241           return LANG_VB_ASPX;
 242       }
 243     }
 244   }
 245   return LANG_CS_ASPX;
 246 }
 247
 248 // 6502 assembly or XML-based Advanced Stream Redirector ?
 249 const char *disambiguate_asx(SourceFile *sourcefile) {
 250   char *p = ohcount_sourcefile_get_contents(sourcefile);
 251   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 252   for (; p < eof; p++) {
 253     switch (*p) {
 254     case ' ':
 255     case '\t':
 256     case '\n':
 257     case '\r':
 258       break;
 259     case '<':
 260     case '\0':
 261     // byte-order marks:
 262     case (char) 0xef:
 263     case (char) 0xfe:
 264     case (char) 0xff:
 265       return NULL; // XML
 266     default:
 267       return LANG_ASSEMBLER;
 268     }
 269   }
 270   return LANG_ASSEMBLER; // only blanks - not valid XML, may be valid asm
 271 }
 272
 273 const char *disambiguate_b(SourceFile *sourcefile) {
 274   char *p = ohcount_sourcefile_get_contents(sourcefile);
 275   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 276   while (p < eof) {
 277     // /(implement[ \t])|(include[ \t]+"[^"]*";)|
 278     //  ((return|break|continue).*;|(pick|case).*\{)/
 279     if (strncmp(p, "implement", 9) == 0 &&
 280         (*(p + 9) == ' ' || *(p + 9) == '\t'))
 281       return LANG_LIMBO;
 282     else if (strncmp(p, "include", 7) == 0 &&
 283         (*(p + 7) == ' ' || *(p + 7) == '\t')) {
 284       p += 7;
 285       while (*p == ' ' || *p == '\t') p++;
 286       if (*p == '"') {
 287         while (*p != '"' && p < eof) p++;
 288         if (*p == '"' && *(p + 1) == ';')
 289           return LANG_LIMBO;
 290       }
 291     } else if (strncmp(p, "return", 6) == 0 ||
 292                strncmp(p, "break", 5) == 0 ||
 293                strncmp(p, "continue", 8) == 0) {
 294       if (strstr(p, ";"))
 295         return LANG_LIMBO;
 296     } else if (strncmp(p, "pick", 4) == 0 ||
 297                strncmp(p, "case", 4) == 0) {
 298       if (strstr(p, "{"))
 299         return LANG_LIMBO;
 300     }
 301     p++;
 302   }
 303   return disambiguate_basic(sourcefile);
 304 }
 305
 306 const char *disambiguate_basic(SourceFile *sourcefile) {
 307   char *p, *pe;
 308   int length;
 309
 310   // Attempt to detect based on file contents.
 311   char line[81];
 312   p = ohcount_sourcefile_get_contents(sourcefile);
 313   pe = p;
 314   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 315   while (pe < eof) {
 316     // Get a line at a time.
 317     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 318     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 319     strncpy(line, p, length);
 320     line[length] = '\0';
 321     char *line_end = pe;
 322
 323     p = line;
 324     if (isdigit(*p)) {
 325       // /^\d+\s+\w/
 326       p++;
 327       while (isdigit(*p)) p++;
 328       if (*p == ' ' || *p == '\t') {
 329         p++;
 330         while (*p == ' ' || *p == '\t') p++;
 331         if (isalnum(*p))
 332           return LANG_CLASSIC_BASIC;
 333       }
 334     }
 335
 336     // Next line.
 337     pe = line_end;
 338     while (*pe == '\r' || *pe == '\n') pe++;
 339     p = pe;
 340   }
 341
 342   // Attempt to detect from associated VB files in file context.
 343   char **filenames = sourcefile->filenames;
 344   if (filenames) {
 345     int i;
 346     for (i = 0; filenames[i] != NULL; i++) {
 347       pe = filenames[i] + strlen(filenames[i]);
 348       p = pe;
 349       while (p > filenames[i] && *(p - 1) != '.') p--;
 350       length = pe - p;
 351       if (length == 3 &&
 352           (strncmp(p, "frm", length) == 0 ||
 353            strncmp(p, "frx", length) == 0 ||
 354            strncmp(p, "vba", length) == 0 ||
 355            strncmp(p, "vbp", length) == 0 ||
 356            strncmp(p, "vbs", length) == 0)) {
 357         return LANG_VISUALBASIC;
 358       }
 359     }
 360   }
 361
 362   return LANG_STRUCTURED_BASIC;
 363 }
 364
 365 const char *disambiguate_cs(SourceFile *sourcefile) {
 366   // Attempt to detect based on file contents.
 367         char *contents = ohcount_sourcefile_get_contents(sourcefile);
 368   if (contents && strstr(contents, "<?cs"))
 369     return LANG_CLEARSILVER_TEMPLATE;
 370   else
 371     return LANG_CSHARP;
 372 }
 373
 374 const char *disambiguate_def(SourceFile *sourcefile) {
 375   char *p = ohcount_sourcefile_get_contents(sourcefile);
 376   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 377   for (; p < eof; p++) {
 378     switch (*p) {
 379     case ' ':
 380     case '\t':
 381     case '\n':
 382     case '\r':
 383       break;
 384     case '(':
 385       if (p[1] == '*') // Modula-2 comment
 386         return LANG_MODULA2;
 387       return NULL;
 388     case 'D':
 389       if (strncmp(p, "DEFINITION", 10) == 0) // Modula-2 "DEFINITION MODULE"
 390         return LANG_MODULA2;
 391       return NULL;
 392     default:
 393       return NULL; // not Modula-2
 394     }
 395   }
 396   return NULL; // only blanks
 397 }
 398
 399 const char *disambiguate_fortran(SourceFile *sourcefile) {
 400   char *p, *pe;
 401
 402   p = ohcount_sourcefile_get_contents(sourcefile);
 403   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 404   while (p < eof) {
 405     if (*p == ' ' && p + 5 < eof) {
 406       int i;
 407       for (i = 1; i <= 5; i++)
 408         if (!isdigit(*(p + i)) && *(p + i) != ' ')
 409           return LANG_FORTRANFREE; // definately not fixed
 410       // Possibly fixed (doesn't match /^\s*\d+\s*$/).
 411       pe = p;
 412       while (*pe == ' ' || *pe == '\t') pe++;
 413       if (pe - p <= 5) {
 414         if (!isdigit(*pe))
 415           return LANG_FORTRANFIXED;
 416         while (isdigit(*pe)) pe++;
 417         while (*pe == ' ' || *pe == '\t') pe++;
 418         if (*pe != '\r' && *pe != '\n' && pe - p == 5)
 419           return LANG_FORTRANFIXED;
 420       }
 421     }
 422     while (*p != '\r' && *p != '\n' && *p != '&' && p < eof) p++;
 423     if (*p == '&') {
 424       p++;
 425       // Look for free-form continuation.
 426       while (*p == ' ' || *p == '\t') p++;
 427       if (*p == '\r' || *p == '\n') {
 428         pe = p;
 429         while (*pe == '\r' || *pe == '\n' || *pe == ' ' || *pe == '\t') pe++;
 430         if (*pe == '&')
 431           return LANG_FORTRANFREE;
 432       }
 433     }
 434     while (*p == '\r' || *p == '\n') p++;
 435   }
 436   return LANG_FORTRANFREE; // might as well be free-form
 437 }
 438
 439 const char *disambiguate_h(SourceFile *sourcefile) {
 440   char *p, *pe, *bof;
 441   int length;
 442
 443   // If the directory contains a matching *.m file, likely Objective C.
 444   length = strlen(sourcefile->filename);
 445   if (strcmp(sourcefile->ext, "h") == 0) {
 446     char path[length];
 447     strncpy(path, sourcefile->filename, length);
 448     path[length] = '\0';
 449     *(path + length - 1) = 'm';
 450     char **filenames = sourcefile->filenames;
 451     if (filenames) {
 452       int i;
 453       for (i = 0; filenames[i] != NULL; i++)
 454         if (strcmp(path, filenames[i]) == 0)
 455           return LANG_OBJECTIVE_C;
 456     }
 457   }
 458
 459   // Attempt to detect based on file contents.
 460   char line[81], buf[81];
 461   bof = ohcount_sourcefile_get_contents(sourcefile);
 462   p = bof;
 463   pe = p;
 464   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 465   while (pe < eof) {
 466     // Get a line at a time.
 467     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 468     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 469     strncpy(line, p, length);
 470     line[length] = '\0';
 471     char *eol = line + strlen(line);
 472     char *line_end = pe;
 473
 474     // Look for C++ headers.
 475     if (*line == '#') {
 476       p = line + 1;
 477       while (*p == ' ' || *p == '\t') p++;
 478       if (strncmp(p, "include", 7) == 0 &&
 479           (*(p + 7) == ' ' || *(p + 7) == '\t')) {
 480         // /^#\s*include\s+[<"][^>"]+[>"]/
 481         p += 8;
 482         while (*p == ' ' || *p == '\t') p++;
 483         if (*p == '<' || *p == '"') {
 484           // Is the header file a C++ header file?
 485           p++;
 486           pe = p;
 487           while (pe < eol && *pe != '>' && *pe != '"') pe++;
 488           length = pe - p;
 489           strncpy(buf, p, length);
 490           buf[length] = '\0';
 491           if (ohcount_hash_is_cppheader(buf, length))
 492             return LANG_CPP;
 493           // Is the extension for the header file a C++ file?
 494           p = pe;
 495           while (p > line && *(p - 1) != '.') p--;
 496           length = pe - p;
 497           strncpy(buf, p, length);
 498           buf[length] = '\0';
 499           struct ExtensionMap *re = ohcount_hash_language_from_ext(buf, length);
 500           if (re && strcmp(re->value, LANG_CPP) == 0)
 501             return LANG_CPP;
 502         }
 503       }
 504     }
 505
 506     // Look for C++ keywords.
 507     p = line;
 508     while (p < eol) {
 509       if (islower(*p) && p != bof && !isalnum(*(p - 1)) && *(p - 1) != '_') {
 510         pe = p;
 511         while (islower(*pe)) pe++;
 512         if (!isalnum(*pe) && *pe != '_') {
 513           length = pe - p;
 514           strncpy(buf, p, length);
 515           buf[length] = '\0';
 516           if (strcmp(buf, "class") == 0 ||
 517               strcmp(buf, "namespace") == 0 ||
 518               strcmp(buf, "template") == 0 ||
 519               strcmp(buf, "typename") == 0)
 520             return LANG_CPP;
 521         }
 522         p = pe + 1;
 523       } else p++;
 524     }
 525
 526     // Next line.
 527     pe = line_end;
 528     while (*pe == '\r' || *pe == '\n') pe++;
 529     p = pe;
 530   }
 531
 532   // Nothing to suggest C++.
 533   return LANG_C;
 534 }
 535
 536 const char *disambiguate_in(SourceFile *sourcefile) {
 537   char *p, *pe;
 538   int length;
 539   const char *language = NULL;
 540
 541   p = sourcefile->filepath;
 542   pe = p + strlen(p) - 3;
 543   if (strstr(p, ".") <= pe) {
 544     // Only if the filename has an extension prior to the .in
 545     length = pe - p;
 546     char buf[length];
 547     strncpy(buf, p, length);
 548     buf[length] = '\0';
 549     SourceFile *undecorated = ohcount_sourcefile_new(buf);
 550     p = ohcount_sourcefile_get_contents(sourcefile);
 551                 if (!p) {
 552                         return NULL;
 553                 }
 554     // The filepath without the '.in' extension does not exist on disk. The
 555     // sourcefile->diskpath field must be set incase the detector needs to run
 556     // 'file -b' on the file.
 557     ohcount_sourcefile_set_diskpath(undecorated, sourcefile->filepath);
 558     ohcount_sourcefile_set_contents(undecorated, p);
 559                 undecorated->filenames = sourcefile->filenames;
 560     language = ohcount_sourcefile_get_language(undecorated);
 561     ohcount_sourcefile_free(undecorated);
 562   }
 563   return language;
 564 }
 565
 566 const char *disambiguate_inc(SourceFile *sourcefile) {
 567   char *p = ohcount_sourcefile_get_contents(sourcefile);
 568         if (p) {
 569                 char *eof = p + strlen(p);
 570                 while (p < eof) {
 571                         if (*p == '\0')
 572                                 return BINARY;
 573                         else if (*p == '?' && strncmp(p + 1, "php", 3) == 0)
 574                                 return LANG_PHP;
 575                         p++;
 576                 }
 577         }
 578   return NULL;
 579 }
 580
 581 const char *disambiguate_m(SourceFile *sourcefile) {
 582   char *p, *pe;
 583   int length;
 584
 585   // Attempt to detect based on a weighted heuristic of file contents.
 586   int matlab_score = 0;
 587   int objective_c_score = 0;
 588   int limbo_score = 0;
 589   int octave_syntax_detected = 0;
 590
 591   int i, has_h_headers = 0, has_c_files = 0;
 592   char **filenames = sourcefile->filenames;
 593   if (filenames) {
 594     for (i = 0; filenames[i] != NULL; i++) {
 595       p = filenames[i];
 596       pe = p + strlen(p);
 597       if (pe - p >= 4) {
 598         if (*(pe - 4) == '.' && *(pe - 3) == 'c' &&
 599             ((*(pe - 2) == 'p' && *(pe - 1) == 'p') ||
 600              (*(pe - 2) == '+' && *(pe - 1) == '+') ||
 601              (*(pe - 2) == 'x' && *(pe - 1) == 'x'))) {
 602           has_c_files = 1;
 603           break; // short circuit
 604         }
 605       } else if (pe - p >= 3) {
 606         if (*(pe - 3) == '.' && *(pe - 2) == 'c' && *(pe - 1) == 'c') {
 607           has_c_files = 1;
 608           break; // short circuit
 609         }
 610       } else if (pe - p >= 2) {
 611         if (*(pe - 2) == '.') {
 612           if (*(pe - 1) == 'h')
 613             has_h_headers = 1;
 614           else if (*(pe - 1) == 'c' || *(pe - 1) == 'C') {
 615             has_c_files = 1;
 616             break; // short circuit
 617           }
 618         }
 619       }
 620     }
 621   }
 622   if (has_h_headers && !has_c_files)
 623     objective_c_score += 5;
 624
 625   char line[81], buf[81];
 626   p = ohcount_sourcefile_get_contents(sourcefile);
 627   pe = p;
 628   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 629   while (pe < eof) {
 630     // Get a line at a time.
 631     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 632     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 633     strncpy(line, p, length);
 634     line[length] = '\0';
 635     char *eol = line + strlen(line);
 636     char *line_end = pe;
 637
 638     // Look for tell-tale lines.
 639     p = line;
 640     while (*p == ' ' || *p == '\t') p++;
 641     if (*p == '%') { // Matlab comment
 642       matlab_score++;
 643                 } else if (*p == '#' && strncmp(p, "#import", 7) == 0) { // Objective C
 644                         objective_c_score++;
 645     } else if (*p == '#') { // Limbo or Octave comment
 646       while (*p == '#') p++;
 647       if (*p == ' ' || *p == '\t') {
 648         limbo_score++;
 649         matlab_score++;
 650         octave_syntax_detected = 1;
 651       }
 652     } else if (*p == '/' && *(p + 1) == '/' || *(p + 1) == '*') {
 653       objective_c_score++; // Objective C comment
 654     } else if (*p == '+' || *p == '-') { // Objective C method signature
 655       objective_c_score++;
 656     } else if (*p == '@' || *p == '#') { // Objective C method signature
 657       if (strncmp(p, "@implementation", 15) == 0 ||
 658           strncmp(p, "@interface", 10) == 0)
 659         objective_c_score++;
 660     } else if (strncmp(p, "function", 8) == 0) { // Matlab or Octave function
 661       p += 8;
 662       while (*p == ' ' || *p == '\t') p++;
 663       if (*p == '(')
 664         matlab_score++;
 665     } else if (strncmp(p, "include", 7) == 0) { // Limbo include
 666       // /^include[ \t]+"[^"]+\.m";/
 667       p += 7;
 668       if (*p == ' ' || *p == '\t') {
 669         while (*p == ' ' || *p == '\t') p++;
 670         if (*p == '"') {
 671           while (*p != '"' && p < eol) p++;
 672           if (*p == '"' && *(p - 2) == '.' && *(p - 1) == 'm')
 673             limbo_score++;
 674         }
 675       }
 676     }
 677
 678     // Look for Octave keywords.
 679     p = line;
 680     while (p < eol) {
 681       if (islower(*p) && p != line && !isalnum(*(p - 1))) {
 682         pe = p;
 683         while (islower(*pe) || *pe == '_') pe++;
 684         if (!isalnum(*pe)) {
 685           length = pe - p;
 686           strncpy(buf, p, length);
 687           buf[length] = '\0';
 688           if (strcmp(buf, "end_try_catch") == 0 ||
 689               strcmp(buf, "end_unwind_protect") == 0 ||
 690               strcmp(buf, "endfunction") == 0 ||
 691               strcmp(buf, "endwhile") == 0)
 692             octave_syntax_detected = 1;
 693         }
 694         p = pe + 1;
 695       } else p++;
 696     }
 697
 698     // Look for Limbo declarations
 699     p = line;
 700     while (p < eol) {
 701       if (*p == ':' && (*(p + 1) == ' ' || *(p + 1) == '\t')) {
 702         // /:[ \t]+(module|adt|fn ?\(|con[ \t])/
 703         p += 2;
 704         if (strncmp(p, "module", 6) == 0 && !isalnum(*(p + 6)) ||
 705             strncmp(p, "adt", 3) == 0 && !isalnum(*(p + 3)) ||
 706             strncmp(p, "fn", 2) == 0 &&
 707               (*(p + 2) == ' ' && *(p + 3) == '(' || *(p + 2) == '(') ||
 708             strncmp(p, "con", 3) == 0 &&
 709               (*(p + 3) == ' ' || *(p + 3) == '\t'))
 710           limbo_score++;
 711       } else p++;
 712     }
 713
 714     // Next line.
 715     pe = line_end;
 716     while (*pe == '\r' || *pe == '\n') pe++;
 717     p = pe;
 718   }
 719
 720   if (limbo_score > objective_c_score && limbo_score > matlab_score)
 721     return LANG_LIMBO;
 722   else if (objective_c_score > matlab_score)
 723     return LANG_OBJECTIVE_C;
 724   else
 725     return octave_syntax_detected ? LANG_OCTAVE : LANG_MATLAB;
 726 }
 727
 728 #include <pcre.h>
 729
 730 // strnlen is not available on OS X, so we roll our own
 731 size_t mystrnlen(const char *begin, size_t maxlen) {
 732   const char *end = memchr(begin, '\0', maxlen);
 733   return end ? (end - begin) : maxlen;
 734 }
 735
 736 const char *disambiguate_pp(SourceFile *sourcefile) {
 737         char *p = ohcount_sourcefile_get_contents(sourcefile);
 738   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 739
 740         /* prepare regular expressions */
 741         pcre *re;
 742         const char *error;
 743         int erroffset;
 744         re = pcre_compile("(define\\s+\\w+\\s*\\(|class \\s+\\w+\\s*{)", 0, &error, &erroffset, NULL);
 745
 746         for (; p < eof; p++) {
 747                 if (strncmp(p, "$include", 8) == 0 ||
 748                                 strncmp(p, "$INCLUDE", 8) == 0 ||
 749                                 strncmp(p, "end.", 4) == 0)
 750                         return LANG_PASCAL;
 751                 if (strncmp(p, "enable =>", 9) == 0 ||
 752                                 strncmp(p, "ensure =>", 9) == 0 ||
 753                                 strncmp(p, "content =>", 10) == 0 ||
 754                                 strncmp(p, "source =>", 9) == 0 ||
 755                                 strncmp(p, "include ", 8) == 0)
 756                         return LANG_PUPPET;
 757
 758                 /* regexp for checking for define and class declarations */
 759
 760                 int rc;
 761                 int ovector[30];
 762                 rc = pcre_exec(re, NULL, p, mystrnlen(p, 100), 0, 0, ovector, 30);
 763                 if(rc > 0) {
 764                         return LANG_PUPPET;
 765                 }
 766
 767         }
 768         return LANG_PASCAL;
 769 }
 770
 771 const char *disambiguate_pl(SourceFile *sourcefile) {
 772   // Attempt to detect based on file contents.
 773         char *contents = ohcount_sourcefile_get_contents(sourcefile);
 774   if (contents && strstr(contents, "#!/usr/bin/perl"))
 775     return LANG_PERL;
 776   else if (contents && strstr(contents, "#!/usr/local/bin/perl"))
 777     return LANG_PERL;
 778   else if (contents && strstr(contents, ":-"))
 779     return LANG_PROLOG;
 780   else
 781     return LANG_PERL;
 782 }
 783
 784 #define QMAKE_SOURCES_SPACE "SOURCES +="
 785 #define QMAKE_SOURCES "SOURCES+="
 786 #define QMAKE_CONFIG_SPACE "CONFIG +="
 787 #define QMAKE_CONFIG "CONFIG+="
 788
 789 const char *disambiguate_pro(SourceFile *sourcefile) {
 790         char *p = ohcount_sourcefile_get_contents(sourcefile);
 791   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 792         for (; p < eof; p++) {
 793                 if (strncmp(p, QMAKE_SOURCES_SPACE, strlen(QMAKE_SOURCES_SPACE)) == 0 ||
 794                                 strncmp(p, QMAKE_SOURCES, strlen(QMAKE_SOURCES)) == 0 ||
 795                                 strncmp(p, QMAKE_CONFIG_SPACE, strlen(QMAKE_CONFIG_SPACE)) == 0 ||
 796                                 strncmp(p, QMAKE_CONFIG, strlen(QMAKE_CONFIG)) == 0)
 797                         return LANG_MAKE; // really QMAKE
 798         }
 799         return LANG_IDL_PVWAVE;
 800 }
 801
 802 const char *disambiguate_r(SourceFile *sourcefile) {
 803   char *contents = ohcount_sourcefile_get_contents(sourcefile);
 804   if (!contents)
 805     return LANG_R;
 806
 807   char *eof = contents + ohcount_sourcefile_get_contents_size(sourcefile);
 808
 809   // Detect REBOL by looking for the occurence of "rebol" in the contents
 810   // (case-insensitive). Correct REBOL scripts have a "REBOL [...]" header
 811   // block.
 812   char *needle = "rebol";
 813   int len = strlen(needle);
 814   for (; contents < eof - len; ++contents)
 815     if (tolower(*contents) == *needle &&
 816           !strncasecmp(contents, needle, len))
 817       return LANG_REBOL;
 818
 819   return LANG_R;
 820 }
 821
 822 const char *disambiguate_st(SourceFile *sourcefile) {
 823   char *p, *pe;
 824   int length;
 825
 826   // Attempt to detect based on file contents.
 827   int found_assignment = 0, found_block_start = 0, found_block_end = 0;
 828
 829   char line[81];
 830   p = ohcount_sourcefile_get_contents(sourcefile);
 831   pe = p;
 832   char *eof = p + ohcount_sourcefile_get_contents_size(sourcefile);
 833   while (pe < eof) {
 834     // Get a line at a time.
 835     while (pe < eof && *pe != '\r' && *pe != '\n') pe++;
 836     length = (pe - p <= sizeof(line)) ? pe - p : sizeof(line);
 837     strncpy(line, p, length);
 838     line[length] = '\0';
 839     char *eol = line + strlen(line);
 840     char *line_end = pe;
 841
 842     for (p = line; p < eol; p++) {
 843       if (*p == ':') {
 844         p++;
 845         while (p < eol && (*p == ' ' || *p == '\t')) p++;
 846         if (*p == '=')
 847           found_assignment = 1;
 848         else if (*p == '[')
 849           found_block_start = 1;
 850       } else if (*p == ']' && *(p + 1) == '.') found_block_end = 1;
 851       if (found_assignment && found_block_start && found_block_end)
 852         return LANG_SMALLTALK;
 853     }
 854
 855     // Next line.
 856     pe = line_end;
 857     while (*pe == '\r' || *pe == '\n') pe++;
 858     p = pe;
 859   }
 860
 861   return NULL;
 862 }
 863
 864 int ohcount_is_binary_filename(const char *filename) {
 865   char *p = (char *)filename + strlen(filename);
 866   while (p > filename && *(p - 1) != '.') p--;
 867   if (p > filename) {
 868     struct ExtensionMap *re;
 869     int length = strlen(p);
 870     re = ohcount_hash_language_from_ext(p, length);
 871     if (re) return ISBINARY(re->value);
 872     // Try the lower-case version of this extension.
 873     char lowerext[length];
 874     strncpy(lowerext, p, length);
 875     lowerext[length] = '\0';
 876     for (p = lowerext; p < lowerext + length; p++) *p = tolower(*p);
 877     re = ohcount_hash_language_from_ext(lowerext, length);
 878     if (re) return ISBINARY(re->value);
 879   }
 880   return 0;
 881 }