Initial Revision
[ohcount] / ext / ohcount_native / parser.c
1 /*
2  *  parser.c
3  *  Ohcount
4  *
5  *  Created by Jason Allen on 6/23/06.
6  *  Copyright 2006 Ohloh. All rights reserved.
7  *
8  */
9
10 /* #include <mcheck.h> - for memory debugging */
11 #include "ruby.h"
12 #include "common.h"
13
14
15 /*****************************************************************************
16                                  ParseContext
17 *****************************************************************************/
18
19 /*
20  * ParseContext holds all the state required to parse a buffer
21  *
22  * This is where we keep state of an ongoing parse. It's not in the header file because
23  * it's not meant to be shared with anyone else... strictly internal!
24  */
25 typedef struct {
26         // the buffer we're parsing
27         char *parse_buffer;
28
29         // the length of the entire buffer
30         int parse_buffer_len;
31
32         // the current cursor we're processing
33         char *parse_cur;
34
35         // the length of the buffer left to parse
36         int parse_left_len;
37
38         // where the current line started (just past the last newline)
39         char *parse_line_start;
40
41         // where there current state should start attribution from
42         char *attribute_from;
43
44         // the index of the current line (just for debugging)
45         int cur_line_index;
46
47         // the state stack
48         CompiledState *cs_stack[MAX_CS_STACK];
49         int cs_stack_index;
50
51         // the state we're attributing this current line to (NULL means we haven't attributed it yet!)
52         State *line_attributed_state;
53
54         // language_breakdowns
55         LanguageBreakdown language_breakdowns[MAX_LANGUAGE_BREAKDOWN_SIZE];
56         int language_breakdown_count;
57
58 } ParseContext;
59
60 /*
61  * parse_context_find_or_create_language_breakdown
62  *
63  * Will return a valid language_breakdown pointer for a given language name.
64  */
65 LanguageBreakdown *parse_context_find_or_create_language_breakdown(ParseContext *parse_context, char *name) {
66         int i_lb;
67
68         // iterate to find
69         for (i_lb = 0; i_lb < parse_context->language_breakdown_count; i_lb++) {
70                 if (strcmp(parse_context->language_breakdowns[i_lb].name, name) == 0) {
71                         return &parse_context->language_breakdowns[i_lb];
72                 }
73         }
74
75         // doesn't exist, create new onw
76         log("[ohcount] creating language_breakdown: '%s'\n", name);
77 #ifndef NDEBUG
78         if (parse_context->language_breakdown_count >= MAX_LANGUAGE_BREAKDOWN_SIZE) {
79                 log("[ohcount] - ASSERT FAILED: parse_context->language_breakdown_count too big (%d)\n", parse_context->language_breakdown_count);
80         }
81 #endif
82         language_breakdown_initialize(&parse_context->language_breakdowns[parse_context->language_breakdown_count], name, parse_context->parse_buffer_len + 5); /* just in case we pad with newline or something */
83         log("[ohcount] done creating language_breakdown: '%s'\n", name);
84         return &parse_context->language_breakdowns[parse_context->language_breakdown_count++];
85 }
86
87 /*
88  * parse_yield_line
89  *
90  * yeilds the just-processed line back up to an optional Ruby block,
91  * along with its language and semantic information.
92  */
93 void parse_yield_line(ParseContext *parse_context, char *up_to, State *state) {
94         VALUE ary;
95         if (rb_block_given_p()) {
96                 ary = rb_ary_new2(2);
97                 rb_ary_store(ary, 0, ID2SYM(rb_intern(state->language)));
98                 rb_ary_store(ary, 2, rb_str_new(parse_context->parse_line_start, up_to - parse_context->parse_line_start));
99
100                 switch (state->semantic) {
101                         case semantic_code:
102                                 rb_ary_store(ary, 1, ID2SYM(rb_intern("code")));
103                                 break;
104                         case semantic_comment:
105                                 rb_ary_store(ary, 1, ID2SYM(rb_intern("comment")));
106                                 break;
107                         case semantic_blank:
108                         case semantic_null:
109                                 rb_ary_store(ary, 1, ID2SYM(rb_intern("blank")));
110                                 break;
111                         default:
112                                 break;
113                 }
114                 rb_yield(ary);
115         }
116 }
117
118 /*
119  * parse_context_process_line
120  *
121  * will 'consume' the current line (parse_context->parse_line_start to 'up_to').
122  * code and comments are copied, blanks are simply tallied up.
123  */
124 void parse_context_process_line(ParseContext *parse_context, char *up_to, State *state) {
125         parse_yield_line(parse_context, up_to, state);
126
127         LanguageBreakdown *lb = parse_context_find_or_create_language_breakdown(parse_context,state->language);
128         switch (state->semantic) {
129                 case semantic_code:
130                         language_breakdown_copy_code(lb, parse_context->parse_line_start, up_to);
131                         break;
132                 case semantic_comment:
133                         language_breakdown_copy_comment(lb, parse_context->parse_line_start, up_to);
134                         break;
135                 case semantic_null:
136                 case semantic_blank:
137                         log("[ohcount] blankline at line %d\n", parse_context->cur_line_index);
138                         lb->blank_count++;
139                         break;
140                 default:
141                         die("Unknown semantic", ERR_UNKNOWN_SEMANTIC);
142         }
143 }
144
145 /*
146  * parse_context_current_cs
147  *
148  * accessor for the top of the CompiledState stack
149  */
150 CompiledState *parse_context_current_cs(ParseContext *parse_context) {
151 #ifndef NDEBUG
152         if (parse_context->cs_stack_index < 0 ||        parse_context->cs_stack_index >= MAX_CS_STACK) {
153                 log("[ohcount] - ASSERT FAILED: parse_context->cs_stack_index out of bounds (%d)\n", parse_context->cs_stack_index);
154         }
155 #endif
156         if (parse_context->cs_stack_index == 0) {
157                 return NULL;
158         }
159         return parse_context->cs_stack[parse_context->cs_stack_index - 1];
160 }
161
162 /*
163  * parse_context_current_state
164  *
165  * accessor for state represented by the top of the CompiledState stack
166  */
167 State *parse_context_current_state(ParseContext *parse_context) {
168         CompiledState *cs = parse_context_current_cs(parse_context);
169         if (cs == NULL) {
170                 return NULL;
171         }
172         return cs->state;
173 }
174
175 /*
176  * parse_context_last_attributed_semantic
177  *
178  * accessor that returns the current line's attributed semantic (null if none was attributed
179  * yet).
180  */
181 enum Semantic parse_context_last_attributed_semantic(ParseContext *parse_context) {
182         if (parse_context->line_attributed_state == NULL) {
183                 return semantic_null;
184         }
185         return parse_context->line_attributed_state->semantic;
186 }
187
188 /*
189  * parse_context_current_pcre
190  *
191  * returns the pcre (compiled regular expression) for the current state
192  */
193 pcre *parse_context_current_pcre(ParseContext *parse_context) {
194         return parse_context_current_cs(parse_context)->pcre;
195 }
196
197 /*
198  * parse_context_current_pcre_extra
199  *
200  * returns the pcre_extra (compiled regular expression additional hints) for the current state
201  */
202 pcre_extra *parse_context_current_pcre_extra(ParseContext *parse_context) {
203         return parse_context_current_cs(parse_context)->pcre_extra;
204 }
205
206 /*
207  * parse_context_attribute
208  *
209  * Determines whether the chunk of code seen up to 'at' should be attributed to the
210  * current state or not. The rules are pretty simple:
211  *  - semantic_null < semantic_blank < semantic_comment < semantic_code
212  *  - comment and code don't count if there are only blanks characters
213  *
214  * if we ate a newline then we also do some postprocessing -- mostly copy the current
215  * line to the appropriate buffer.
216  *
217  */
218 void parse_context_attribute(ParseContext *parse_context, char *at, bool process_line) {
219         enum Semantic last_semantic = parse_context_last_attributed_semantic(parse_context);
220         State *state = parse_context_current_state(parse_context);
221         bool trumped = state_trumps_language(state, parse_context->line_attributed_state);
222
223         log0("[ohcount] - __ATTRIBUTION__\n");
224 #ifndef NDEBUG
225         char temp_buf[20];
226         int max_chars = (at - parse_context->attribute_from);
227         if (max_chars > 19) {
228                 max_chars = 19;
229         }
230         strncpy(temp_buf, parse_context->attribute_from, max_chars);
231         temp_buf[max_chars] = 0;
232         if (state) {
233                 log2("[ohcount] - state[%s] eating '%s'\n", state->name, temp_buf);
234         } else {
235                 log("[ohcount] - NULL state eating '%s'\n", temp_buf);
236         }
237         State *last_state = parse_context->line_attributed_state;
238         if (last_state) {
239                 log("[ohcount] - last_attirbuted_state: '%s'\n", last_state->name);
240         }
241 #endif
242
243
244
245         // shortcut -- if we've already found code, nothing else could really make a difference -- just bail
246         if (last_semantic != semantic_code || trumped) {
247
248                 // main code to attribute the chunk of code
249                 log2("[ohcount] - attributing(at[%d], attribute_from[%d]\n", at, parse_context->attribute_from);
250                 if (at > parse_context->attribute_from) {
251
252                         // if we're attributing to blank, we dont care what's in the string
253                         if (state->semantic == semantic_blank && last_semantic == semantic_null) {
254                                 parse_context->line_attributed_state = state;
255                                 log2("[ohcount] - line %d now being assigned state '%s'\n", parse_context->cur_line_index, state->name);
256                         } else {
257
258                                 // we need to see some non-blank characters before we can acredit a comment or code
259                                 char *cur = parse_context->attribute_from;
260                                 bool saw_non_blank = false;
261                                 while (cur < at) {
262                                         if (*cur > 32) { /* ascii chars below 32 are non-printing */
263                                                 log2("attributing character 0x%x %c\n", (int)*cur, *cur);
264                                                 saw_non_blank = true;
265                                                 cur = at;
266                                         }
267                                         cur++;
268                                 }
269                                 if (saw_non_blank) {
270                                         if (trumped || (
271                                                                 (last_semantic == semantic_blank || last_semantic == semantic_null) &&
272                                                                 (state->semantic == semantic_code || state->semantic == semantic_comment) )) {
273                                                 parse_context->line_attributed_state = state;
274                                                 log2("[ohcount] - line %d now being assigned state '%s'\n", parse_context->cur_line_index, state->name);
275                                         } else if (last_semantic == semantic_comment && state->semantic == semantic_code) {
276                                                 parse_context->line_attributed_state = state;
277                                                 log2("[ohcount] - line %d now being assigned state '%s'\n", parse_context->cur_line_index, state->name);
278                                         }
279                                 }
280                         }
281                 }
282         }
283
284         // copy line to appropriate buffer, if appropriate
285         if (process_line) {
286
287                 State *attributed_state = parse_context->line_attributed_state;
288                 State temp_state;
289
290                 if (attributed_state == NULL) {
291                         // if the line is totally blank, we haven't attributed it to anything yet
292                         // instead, we'll invent a temporary state (same language.name as previous state on stack, but semantic blank)
293                         temp_state.language = parse_context_current_state(parse_context)->language;
294                         temp_state.name = parse_context_current_state(parse_context)->name;
295                         temp_state.semantic = semantic_blank;
296                         attributed_state = &temp_state;
297                         log("[ohcount] - eating_newline. line_attributed_state=[MADE UP! semantic blank, language->%s\n", attributed_state->language);
298                 } else {
299                         log("[ohcount] - eating_newline. line_attributed_state=%s\n", attributed_state->name);
300                 }
301
302                 parse_context_process_line(parse_context, at, attributed_state);
303                 parse_context->parse_line_start = at;
304                 parse_context->attribute_from = at;
305                 parse_context->line_attributed_state = NULL;
306                 parse_context->cur_line_index++;
307         }
308
309 }
310
311
312 /*
313  * parse_context_transit
314  *
315  * performs transition to new state (by pushing or popping the compiled_state stack)
316  *
317  */
318 void parse_context_transit(ParseContext *parse_context, CompiledState *cs, char *at) {
319         // push (or pop)
320         if (cs == NULL) {
321 #ifndef NDEBUG
322                 if (parse_context->cs_stack_index <= 0) {
323                         log("[ohcount] - ASSERT FAILED: cs_stack_index underflow (%d)\n", parse_context->cs_stack_index);
324                 }
325 #endif
326                 parse_context->cs_stack[--parse_context->cs_stack_index] = cs;
327         } else {
328 #ifndef NDEBUG
329                 if (parse_context->cs_stack_index + 1 >= MAX_CS_STACK) {
330                         log("[ohcount] - ASSERT FAILED: cs_stack_index overflow (%d)\n", parse_context->cs_stack_index);
331                 }
332 #endif
333                 parse_context->cs_stack[parse_context->cs_stack_index++] = cs;
334         }
335         parse_context->attribute_from = at;
336
337 }
338
339 /*
340  * parse_context_initialize
341  *
342  * Initialized a parse_context to be ready to start parsing.
343  *
344  */
345 void parse_context_initialize(ParseContext *parse_context, char *buffer, int buffer_len, CompiledState *initial_cs_state) {
346         parse_context->parse_buffer = buffer;
347         parse_context->parse_buffer_len = buffer_len;
348         parse_context->parse_cur = buffer;
349         parse_context->attribute_from = buffer;
350         parse_context->parse_left_len = buffer_len;
351         parse_context->parse_line_start = buffer;
352         parse_context->cur_line_index = 1; // editors are 1-based...debugging is easier
353
354         parse_context->cs_stack_index = 0;
355         parse_context->line_attributed_state = NULL;
356         parse_context->language_breakdown_count = 0;
357
358         parse_context_attribute(parse_context, buffer, false);
359         parse_context_transit(parse_context, initial_cs_state, buffer);
360 }
361
362
363 /*
364  * parse_context_get_transition
365  *
366  * returns the "nth" transition from the current parse_context.
367  *
368  */
369 Transition *parse_context_get_transition(ParseContext *parse_context, int transition_index) {
370         CompiledState *compiled_state = parse_context_current_cs(parse_context);
371         return compiled_state_get_transition(compiled_state, transition_index);
372 }
373
374
375
376 /*****************************************************************************
377                                   ParseResult
378 *****************************************************************************/
379
380 /*
381  * parse_result_initialize
382  *
383  * initializes a parse_result from the parse_context
384  *
385  */
386 void parse_result_initialize(ParseResult *pr, ParseContext *parse_context) {
387         int i_lb;
388         for (i_lb = 0; i_lb < parse_context->language_breakdown_count; i_lb++) {
389                         pr->language_breakdowns[i_lb] = parse_context->language_breakdowns[i_lb];
390         }
391         pr->language_breakdown_count = parse_context->language_breakdown_count;
392 }
393
394 /*
395  * parse_result_free
396  *
397  * Deallocates the memory held by a ParseResult.
398  *
399  */
400 void parse_result_free(ParseResult *parse_result) {
401         int i_lb;
402         for (i_lb = 0; i_lb < parse_result->language_breakdown_count; i_lb++) {
403                 language_breakdown_free(&parse_result->language_breakdowns[i_lb]);
404         }
405 }
406
407
408 /*****************************************************************************
409                                      Parser
410 *****************************************************************************/
411
412 /*
413  * parser_print_match
414  *
415  * As a debugging tool, we print out the exact matched string.
416  *
417  */
418 void parser_print_match(ParseContext *parse_context, int *ovector, int result) {
419         char match[10];
420         pcre_copy_substring(parse_context->parse_cur, ovector, result, result - 1, match, 10);
421         if (match[0] == '\n') {
422                 log2("[ohcount] state '%s' matched [%s]\n", parse_context_current_state(parse_context)->name,   "\\n");
423         } else {
424                 log2("[ohcount] state '%s' matched [%s]\n", parse_context_current_state(parse_context)->name,   match);
425         }
426 }
427
428 /*
429  * parser_ate_newline
430  *
431  * returns true if the pcre result ate a newline
432  *
433  */
434 bool parser_ate_newline(ParseContext *parse_context, int *ovector) {
435         char *c = parse_context->parse_cur + ovector[0];
436         char *c_last = parse_context->parse_cur + ovector[1];
437         while (c < c_last) {
438                 if (*c++ == '\n') {
439                         return true;
440                 }
441         }
442         return false;
443 }
444
445
446 /*
447  * parser_parse
448  *
449  * The main parsing algorith consists of doing a DFA walk on the source code.
450  * We start in the initial state of the language and then maintain a stack of
451  * states. Transitions are triggered as regular expression matches (as defined
452  * by each state). At every transition we account for the code seen. We keep
453  * track for each line what semantic we've seen so far. Semantics trump each
454  * other in the following order: null < blank < comment < code. As soon as we
455  * see any code, we pretty much ignore other semantics until the newline. We
456  * do, however, keep parsing since we need to maintain the states properly -
457  * in other words, jumping to the newline might make us forget to jump out
458  * of a string state, or something.
459  *
460  */
461 void parser_parse(ParseResult *pr, char *buffer, int buffer_len, Polyglot *polyglot) {
462 #ifndef NDEBUG
463 /* to help debug, export MALLOC_TRACE to output file */
464 /*      mtrace(); */
465 #endif
466
467         // make sure we have compiled states
468         polyglot_compile_states(polyglot);
469
470         // setup the parse context
471         ParseContext parse_context;
472         parse_context_initialize(&parse_context, buffer, buffer_len, &polyglot->compiled_states[0]);
473
474         // MAIN_PARSE_LOOP
475         int ovector[30];
476         int result;
477         while ((result = pcre_exec(parse_context_current_pcre(&parse_context), NULL, parse_context.parse_cur,  parse_context.parse_left_len, 0, 0, ovector, 30)) >= 0) {
478
479 #ifndef NDEBUG
480                 log("[ohcount] pcre result: %d\n", result);
481                 parser_print_match(&parse_context, ovector, result);
482 #endif
483
484                 // crappy hack work around to solve surprisingly complex bug
485                 // its all about the last line - how to avoid attributing twice or not at all
486                 // The complexity comes about because sometimes we actually account for it
487                 // "automatically" - like when the file ends with a newline. However, when it doesn't
488                 //
489
490
491                 // transition if possible (there might not be one if its a newline!)
492                 Transition *t = parse_context_get_transition(&parse_context, result - 2); // -1 for pcre_exec storing the entire match first, -1 to be zero-based (pcre is 1-based, kinda)
493
494                 if (t && t->fake_transition) {
495                         // fake transition -- just ignore it!
496                         log0("- fake transition, still in current state");
497                 } else {
498                         CompiledState *target_cs = NULL;
499                         if (t && t->to_state) {
500                                 // find the target compiled_state
501                                 for (target_cs = polyglot->compiled_states; target_cs->state != t->to_state; target_cs++) {}
502                         }
503
504                         // source or target: who eats the matched string itself?
505                         int at = (t == NULL || t->token_eater == FromEatsToken) ? ovector[1] : ovector[0];
506
507                         // attribute the code/text/blanks we've seen so far
508                         bool ate_newline =  parser_ate_newline(&parse_context, ovector);
509                         parse_context_attribute(&parse_context, parse_context.parse_cur + at, ate_newline);
510
511                         // and transit to our new state (note: we usually won't have a transition if we hit a newline)
512                         // set the 'at' at the proper place, depending on TokenEater
513                         at = (t != NULL && (t->token_eater == ToEatsToken)) ? ovector[0] : ovector[1];
514                         if (t && !t->fake_transition) {
515                                 log("[ohcount] - transition at %d\n", parse_context.parse_cur + at);
516                                 parse_context_transit(&parse_context, target_cs, parse_context.parse_cur + at);
517                         }
518
519 #ifndef NDEBUG
520                         if (ate_newline) {
521                         log2("[ohcount] -- starting line %d in state %s\n", parse_context.cur_line_index, parse_context_current_state(&parse_context)->name);
522                         }
523 #endif
524                 }
525                 // move forward
526                 int jump_chars = (ovector[0] > ovector[1]) ? ovector[0] : ovector[1];
527                 parse_context.parse_left_len -= jump_chars ;
528                 parse_context.parse_cur += jump_chars;
529         }
530
531         switch (result) {
532                 case PCRE_ERROR_NOMATCH:
533                         // attribute what we (might have) eaten so far...
534                         if (parse_context.parse_cur[parse_context.parse_left_len - 1] != '\n') {
535                                 parse_context_attribute(&parse_context, parse_context.parse_cur + parse_context.parse_left_len, true);
536                         }
537                         break;
538                 case PCRE_ERROR_NOMEMORY:
539                         die("PCRE_ERROR_NOMEMORY", ERR_PCRE_OUT_OF_MEMORY);
540                         break;
541                 default:
542                         die("PCRE ERROR", ERR_PCRE_GENERIC);
543                         break;
544         }
545
546         /* setup the parse result */
547         parse_result_initialize(pr, &parse_context);
548
549 #ifndef NDEBUG
550 /*      muntrace(); */
551 #endif
552 }
553
554
555 /*
556  * parser_test
557  *
558  * internal testing code
559  */
560 void parser_test() {
561         char buffer[] = "\"str//i\\\"ng\"";
562         ParseResult pr;
563         int len = strlen(buffer);
564         parser_parse(&pr, buffer, len, POLYGLOTS[0]);
565         printf("parsing this buffer:\n%s\n=============\n", buffer);
566         printf("__code_start__\n%s\n__code_end__\n", pr.language_breakdowns[0].code);
567         printf("__comment_start__\n%s\n__comment_end__\n", pr.language_breakdowns[0].comment);
568         printf("blanks[%d]\n", pr.language_breakdowns[0].blank_count);
569 }