ruby.rl's string and regex literals shouldn't call @code right after '%'.
[ohcount] / ext / ohcount_native / parser.c
1 /*
2  *  parser.c
3  *  Ohcount
4  *
5  *  Created by Jason Allen on 6/23/06.
6  *  Copyright 2006 Ohloh. All rights reserved.
7  *
8  */
9
10 /* #include <mcheck.h> - for memory debugging */
11 #include "ruby.h"
12 #include "common.h"
13 #include "ragel_parser.h"
14
15
16 /*****************************************************************************
17                                  ParseContext
18 *****************************************************************************/
19
20 /*
21  * ParseContext holds all the state required to parse a buffer
22  *
23  * This is where we keep state of an ongoing parse. It's not in the header file because
24  * it's not meant to be shared with anyone else... strictly internal!
25  */
26 typedef struct {
27         // the buffer we're parsing
28         char *parse_buffer;
29
30         // the length of the entire buffer
31         int parse_buffer_len;
32
33         // the current cursor we're processing
34         char *parse_cur;
35
36         // the length of the buffer left to parse
37         int parse_left_len;
38
39         // where the current line started (just past the last newline)
40         char *parse_line_start;
41
42         // where there current state should start attribution from
43         char *attribute_from;
44
45         // the index of the current line (just for debugging)
46         int cur_line_index;
47
48         // the state stack
49         CompiledState *cs_stack[MAX_CS_STACK];
50         int cs_stack_index;
51
52         // the state we're attributing this current line to (NULL means we haven't attributed it yet!)
53         State *line_attributed_state;
54
55         // language_breakdowns
56         LanguageBreakdown language_breakdowns[MAX_LANGUAGE_BREAKDOWN_SIZE];
57         int language_breakdown_count;
58
59 } ParseContext;
60
61 /*
62  * parse_context_find_or_create_language_breakdown
63  *
64  * Will return a valid language_breakdown pointer for a given language name.
65  */
66 LanguageBreakdown *parse_context_find_or_create_language_breakdown(ParseContext *parse_context, char *name) {
67         int i_lb;
68
69         // iterate to find
70         for (i_lb = 0; i_lb < parse_context->language_breakdown_count; i_lb++) {
71                 if (strcmp(parse_context->language_breakdowns[i_lb].name, name) == 0) {
72                         return &parse_context->language_breakdowns[i_lb];
73                 }
74         }
75
76         // doesn't exist, create new onw
77         log("[ohcount] creating language_breakdown: '%s'\n", name);
78 #ifndef NDEBUG
79         if (parse_context->language_breakdown_count >= MAX_LANGUAGE_BREAKDOWN_SIZE) {
80                 log("[ohcount] - ASSERT FAILED: parse_context->language_breakdown_count too big (%d)\n", parse_context->language_breakdown_count);
81         }
82 #endif
83         language_breakdown_initialize(&parse_context->language_breakdowns[parse_context->language_breakdown_count], name, parse_context->parse_buffer_len + 5); /* just in case we pad with newline or something */
84         log("[ohcount] done creating language_breakdown: '%s'\n", name);
85         return &parse_context->language_breakdowns[parse_context->language_breakdown_count++];
86 }
87
88 /*
89  * parse_yield_line
90  *
91  * yeilds the just-processed line back up to an optional Ruby block,
92  * along with its language and semantic information.
93  */
94 void parse_yield_line(ParseContext *parse_context, char *up_to, State *state) {
95         VALUE ary;
96         if (rb_block_given_p()) {
97                 ary = rb_ary_new2(2);
98                 rb_ary_store(ary, 0, ID2SYM(rb_intern(state->language)));
99                 rb_ary_store(ary, 2, rb_str_new(parse_context->parse_line_start, up_to - parse_context->parse_line_start));
100
101                 switch (state->semantic) {
102                         case semantic_code:
103                                 rb_ary_store(ary, 1, ID2SYM(rb_intern("code")));
104                                 break;
105                         case semantic_comment:
106                                 rb_ary_store(ary, 1, ID2SYM(rb_intern("comment")));
107                                 break;
108                         case semantic_blank:
109                         case semantic_null:
110                                 rb_ary_store(ary, 1, ID2SYM(rb_intern("blank")));
111                                 break;
112                         default:
113                                 break;
114                 }
115                 rb_yield(ary);
116         }
117 }
118
119 /*
120  * parse_context_process_line
121  *
122  * will 'consume' the current line (parse_context->parse_line_start to 'up_to').
123  * code and comments are copied, blanks are simply tallied up.
124  */
125 void parse_context_process_line(ParseContext *parse_context, char *up_to, State *state) {
126         parse_yield_line(parse_context, up_to, state);
127
128         LanguageBreakdown *lb = parse_context_find_or_create_language_breakdown(parse_context,state->language);
129         switch (state->semantic) {
130                 case semantic_code:
131                         language_breakdown_copy_code(lb, parse_context->parse_line_start, up_to);
132                         break;
133                 case semantic_comment:
134                         language_breakdown_copy_comment(lb, parse_context->parse_line_start, up_to);
135                         break;
136                 case semantic_null:
137                 case semantic_blank:
138                         log("[ohcount] blankline at line %d\n", parse_context->cur_line_index);
139                         lb->blank_count++;
140                         break;
141                 default:
142                         die("Unknown semantic", ERR_UNKNOWN_SEMANTIC);
143         }
144 }
145
146 /*
147  * parse_context_current_cs
148  *
149  * accessor for the top of the CompiledState stack
150  */
151 CompiledState *parse_context_current_cs(ParseContext *parse_context) {
152 #ifndef NDEBUG
153         if (parse_context->cs_stack_index < 0 ||        parse_context->cs_stack_index >= MAX_CS_STACK) {
154                 log("[ohcount] - ASSERT FAILED: parse_context->cs_stack_index out of bounds (%d)\n", parse_context->cs_stack_index);
155         }
156 #endif
157         if (parse_context->cs_stack_index == 0) {
158                 return NULL;
159         }
160         return parse_context->cs_stack[parse_context->cs_stack_index - 1];
161 }
162
163 /*
164  * parse_context_current_state
165  *
166  * accessor for state represented by the top of the CompiledState stack
167  */
168 State *parse_context_current_state(ParseContext *parse_context) {
169         CompiledState *cs = parse_context_current_cs(parse_context);
170         if (cs == NULL) {
171                 return NULL;
172         }
173         return cs->state;
174 }
175
176 /*
177  * parse_context_last_attributed_semantic
178  *
179  * accessor that returns the current line's attributed semantic (null if none was attributed
180  * yet).
181  */
182 enum Semantic parse_context_last_attributed_semantic(ParseContext *parse_context) {
183         if (parse_context->line_attributed_state == NULL) {
184                 return semantic_null;
185         }
186         return parse_context->line_attributed_state->semantic;
187 }
188
189 /*
190  * parse_context_current_pcre
191  *
192  * returns the pcre (compiled regular expression) for the current state
193  */
194 pcre *parse_context_current_pcre(ParseContext *parse_context) {
195         return parse_context_current_cs(parse_context)->pcre;
196 }
197
198 /*
199  * parse_context_current_pcre_extra
200  *
201  * returns the pcre_extra (compiled regular expression additional hints) for the current state
202  */
203 pcre_extra *parse_context_current_pcre_extra(ParseContext *parse_context) {
204         return parse_context_current_cs(parse_context)->pcre_extra;
205 }
206
207 /*
208  * parse_context_attribute
209  *
210  * Determines whether the chunk of code seen up to 'at' should be attributed to the
211  * current state or not. The rules are pretty simple:
212  *  - semantic_null < semantic_blank < semantic_comment < semantic_code
213  *  - comment and code don't count if there are only blanks characters
214  *
215  * if we ate a newline then we also do some postprocessing -- mostly copy the current
216  * line to the appropriate buffer.
217  *
218  */
219 void parse_context_attribute(ParseContext *parse_context, char *at, bool process_line) {
220         enum Semantic last_semantic = parse_context_last_attributed_semantic(parse_context);
221         State *state = parse_context_current_state(parse_context);
222         bool trumped = state_trumps_language(state, parse_context->line_attributed_state);
223
224         log0("[ohcount] - __ATTRIBUTION__\n");
225 #ifndef NDEBUG
226         char temp_buf[20];
227         int max_chars = (at - parse_context->attribute_from);
228         if (max_chars > 19) {
229                 max_chars = 19;
230         }
231         strncpy(temp_buf, parse_context->attribute_from, max_chars);
232         temp_buf[max_chars] = 0;
233         if (state) {
234                 log2("[ohcount] - state[%s] eating '%s'\n", state->name, temp_buf);
235         } else {
236                 log("[ohcount] - NULL state eating '%s'\n", temp_buf);
237         }
238         State *last_state = parse_context->line_attributed_state;
239         if (last_state) {
240                 log("[ohcount] - last_attributed_state: '%s'\n", last_state->name);
241         }
242 #endif
243
244
245
246         // shortcut -- if we've already found code, nothing else could really make a difference -- just bail
247         if (last_semantic != semantic_code || trumped) {
248
249                 // main code to attribute the chunk of code
250                 log2("[ohcount] - attributing(at[%d], attribute_from[%d]\n", at, parse_context->attribute_from);
251                 if (at > parse_context->attribute_from) {
252
253                         // if we're attributing to blank, we dont care what's in the string
254                         if (state->semantic == semantic_blank && last_semantic == semantic_null) {
255                                 parse_context->line_attributed_state = state;
256                                 log2("[ohcount] - line %d now being assigned state '%s'\n", parse_context->cur_line_index, state->name);
257                         } else {
258
259                                 // we need to see some non-blank characters before we can acredit a comment or code
260                                 char *cur = parse_context->attribute_from;
261                                 bool saw_non_blank = false;
262                                 while (cur < at) {
263                                         if (*cur > 32) { /* ascii chars below 32 are non-printing */
264                                                 log2("attributing character 0x%x %c\n", (int)*cur, *cur);
265                                                 saw_non_blank = true;
266                                                 cur = at;
267                                         }
268                                         cur++;
269                                 }
270                                 if (saw_non_blank) {
271                                         if (trumped || (
272                                                                 (last_semantic == semantic_blank || last_semantic == semantic_null) &&
273                                                                 (state->semantic == semantic_code || state->semantic == semantic_comment) )) {
274                                                 parse_context->line_attributed_state = state;
275                                                 log2("[ohcount] - line %d now being assigned state '%s'\n", parse_context->cur_line_index, state->name);
276                                         } else if (last_semantic == semantic_comment && state->semantic == semantic_code) {
277                                                 parse_context->line_attributed_state = state;
278                                                 log2("[ohcount] - line %d now being assigned state '%s'\n", parse_context->cur_line_index, state->name);
279                                         }
280                                 }
281                         }
282                 }
283         }
284
285         // copy line to appropriate buffer, if appropriate
286         if (process_line) {
287
288                 State *attributed_state = parse_context->line_attributed_state;
289                 State temp_state;
290
291                 if (attributed_state == NULL) {
292                         // if the line is totally blank, we haven't attributed it to anything yet
293                         // instead, we'll invent a temporary state (same language.name as previous state on stack, but semantic blank)
294                         temp_state.language = parse_context_current_state(parse_context)->language;
295                         temp_state.name = parse_context_current_state(parse_context)->name;
296                         temp_state.semantic = semantic_blank;
297                         attributed_state = &temp_state;
298                         log("[ohcount] - eating_newline. line_attributed_state=[MADE UP! semantic blank, language->%s\n", attributed_state->language);
299                 } else {
300                         log("[ohcount] - eating_newline. line_attributed_state=%s\n", attributed_state->name);
301                 }
302
303                 parse_context_process_line(parse_context, at, attributed_state);
304                 parse_context->parse_line_start = at;
305                 parse_context->attribute_from = at;
306                 parse_context->line_attributed_state = NULL;
307                 parse_context->cur_line_index++;
308         }
309
310 }
311
312
313 /*
314  * parse_context_transit
315  *
316  * performs transition to new state (by pushing or popping the compiled_state stack)
317  *
318  */
319 void parse_context_transit(ParseContext *parse_context, CompiledState *cs, char *at) {
320         // push (or pop)
321         if (cs == NULL) {
322 #ifndef NDEBUG
323                 if (parse_context->cs_stack_index <= 0) {
324                         log("[ohcount] - ASSERT FAILED: cs_stack_index underflow (%d)\n", parse_context->cs_stack_index);
325                 }
326 #endif
327                 parse_context->cs_stack[--parse_context->cs_stack_index] = cs;
328         } else {
329 #ifndef NDEBUG
330                 if (parse_context->cs_stack_index + 1 >= MAX_CS_STACK) {
331                         log("[ohcount] - ASSERT FAILED: cs_stack_index overflow (%d)\n", parse_context->cs_stack_index);
332                 }
333 #endif
334                 parse_context->cs_stack[parse_context->cs_stack_index++] = cs;
335         }
336         parse_context->attribute_from = at;
337
338 }
339
340 /*
341  * parse_context_initialize
342  *
343  * Initialized a parse_context to be ready to start parsing.
344  *
345  */
346 void parse_context_initialize(ParseContext *parse_context, char *buffer, int buffer_len, CompiledState *initial_cs_state) {
347         parse_context->parse_buffer = buffer;
348         parse_context->parse_buffer_len = buffer_len;
349         parse_context->parse_cur = buffer;
350         parse_context->attribute_from = buffer;
351         parse_context->parse_left_len = buffer_len;
352         parse_context->parse_line_start = buffer;
353         parse_context->cur_line_index = 1; // editors are 1-based...debugging is easier
354
355         parse_context->cs_stack_index = 0;
356         parse_context->line_attributed_state = NULL;
357         parse_context->language_breakdown_count = 0;
358
359         parse_context_attribute(parse_context, buffer, false);
360         parse_context_transit(parse_context, initial_cs_state, buffer);
361 }
362
363
364 /*
365  * parse_context_get_transition
366  *
367  * returns the "nth" transition from the current parse_context.
368  *
369  */
370 Transition *parse_context_get_transition(ParseContext *parse_context, int transition_index) {
371         CompiledState *compiled_state = parse_context_current_cs(parse_context);
372         return compiled_state_get_transition(compiled_state, transition_index);
373 }
374
375
376
377 /*****************************************************************************
378                                   ParseResult
379 *****************************************************************************/
380
381 /*
382  * parse_result_initialize
383  *
384  * initializes a parse_result from the parse_context
385  *
386  */
387 void parse_result_initialize(ParseResult *pr, ParseContext *parse_context) {
388         int i_lb;
389         for (i_lb = 0; i_lb < parse_context->language_breakdown_count; i_lb++) {
390                         pr->language_breakdowns[i_lb] = parse_context->language_breakdowns[i_lb];
391         }
392         pr->language_breakdown_count = parse_context->language_breakdown_count;
393 }
394
395 /*
396  * parse_result_free
397  *
398  * Deallocates the memory held by a ParseResult.
399  *
400  */
401 void parse_result_free(ParseResult *parse_result) {
402         int i_lb;
403         for (i_lb = 0; i_lb < parse_result->language_breakdown_count; i_lb++) {
404                 language_breakdown_free(&parse_result->language_breakdowns[i_lb]);
405         }
406 }
407
408
409 /*****************************************************************************
410                                      Parser
411 *****************************************************************************/
412
413 /*
414  * parser_print_match
415  *
416  * As a debugging tool, we print out the exact matched string.
417  *
418  */
419 void parser_print_match(ParseContext *parse_context, int *ovector, int result) {
420         char match[10];
421         pcre_copy_substring(parse_context->parse_cur, ovector, result, result - 1, match, 10);
422         if (match[0] == '\n') {
423                 log2("[ohcount] state '%s' matched [%s]\n", parse_context_current_state(parse_context)->name,   "\\n");
424         } else {
425                 log2("[ohcount] state '%s' matched [%s]\n", parse_context_current_state(parse_context)->name,   match);
426         }
427 }
428
429 /*
430  * parser_ate_newline
431  *
432  * returns true if the pcre result ate a newline
433  *
434  */
435 bool parser_ate_newline(ParseContext *parse_context, int *ovector) {
436         char *c = parse_context->parse_cur + ovector[0];
437         char *c_last = parse_context->parse_cur + ovector[1];
438         while (c < c_last) {
439                 if (*c++ == '\n') {
440                         return true;
441                 }
442         }
443         return false;
444 }
445
446
447 /*
448  * parser_parse
449  *
450  * The main parsing algorith consists of doing a DFA walk on the source code.
451  * We start in the initial state of the language and then maintain a stack of
452  * states. Transitions are triggered as regular expression matches (as defined
453  * by each state). At every transition we account for the code seen. We keep
454  * track for each line what semantic we've seen so far. Semantics trump each
455  * other in the following order: null < blank < comment < code. As soon as we
456  * see any code, we pretty much ignore other semantics until the newline. We
457  * do, however, keep parsing since we need to maintain the states properly -
458  * in other words, jumping to the newline might make us forget to jump out
459  * of a string state, or something.
460  *
461  */
462 void parser_parse(ParseResult *pr, char *buffer, int buffer_len, Polyglot *polyglot) {
463 #ifndef NDEBUG
464 /* to help debug, export MALLOC_TRACE to output file */
465 /*      mtrace(); */
466 #endif
467
468         if (ragel_parser_parse(pr, buffer, buffer_len, polyglot->name))
469                 return;
470
471         // make sure we have compiled states
472         polyglot_compile_states(polyglot);
473
474         // setup the parse context
475         ParseContext parse_context;
476         parse_context_initialize(&parse_context, buffer, buffer_len, &polyglot->compiled_states[0]);
477
478         // MAIN_PARSE_LOOP
479         int ovector[30];
480         int result;
481         while ((result = pcre_exec(parse_context_current_pcre(&parse_context), NULL, parse_context.parse_cur,  parse_context.parse_left_len, 0, 0, ovector, 30)) >= 0) {
482
483 #ifndef NDEBUG
484                 log("[ohcount] pcre result: %d\n", result);
485                 parser_print_match(&parse_context, ovector, result);
486 #endif
487
488                 // crappy hack work around to solve surprisingly complex bug
489                 // its all about the last line - how to avoid attributing twice or not at all
490                 // The complexity comes about because sometimes we actually account for it
491                 // "automatically" - like when the file ends with a newline. However, when it doesn't
492                 //
493
494
495                 // transition if possible (there might not be one if its a newline!)
496                 Transition *t = parse_context_get_transition(&parse_context, result - 2); // -1 for pcre_exec storing the entire match first, -1 to be zero-based (pcre is 1-based, kinda)
497
498                 if (t && t->fake_transition) {
499                         // fake transition -- just ignore it!
500                         log0("- fake transition, still in current state");
501                 } else {
502                         CompiledState *target_cs = NULL;
503                         if (t && t->to_state) {
504                                 // find the target compiled_state
505                                 for (target_cs = polyglot->compiled_states; target_cs->state != t->to_state; target_cs++) {}
506                         }
507
508                         // source or target: who eats the matched string itself?
509                         int at = (t == NULL || t->token_eater == FromEatsToken) ? ovector[1] : ovector[0];
510
511                         // attribute the code/text/blanks we've seen so far
512                         bool ate_newline =  parser_ate_newline(&parse_context, ovector);
513                         parse_context_attribute(&parse_context, parse_context.parse_cur + at, ate_newline);
514
515                         // and transit to our new state (note: we usually won't have a transition if we hit a newline)
516                         // set the 'at' at the proper place, depending on TokenEater
517                         at = (t != NULL && (t->token_eater == ToEatsToken)) ? ovector[0] : ovector[1];
518                         if (t && !t->fake_transition) {
519                                 log("[ohcount] - transition at %d\n", parse_context.parse_cur + at);
520                                 parse_context_transit(&parse_context, target_cs, parse_context.parse_cur + at);
521                         }
522
523 #ifndef NDEBUG
524                         if (ate_newline) {
525                         log2("[ohcount] -- starting line %d in state %s\n", parse_context.cur_line_index, parse_context_current_state(&parse_context)->name);
526                         }
527 #endif
528                 }
529                 // move forward
530                 int jump_chars = (ovector[0] > ovector[1]) ? ovector[0] : ovector[1];
531                 parse_context.parse_left_len -= jump_chars ;
532                 parse_context.parse_cur += jump_chars;
533         }
534
535         switch (result) {
536                 case PCRE_ERROR_NOMATCH:
537                         // attribute what we (might have) eaten so far...
538                         if (parse_context.parse_cur[parse_context.parse_left_len - 1] != '\n') {
539                                 parse_context_attribute(&parse_context, parse_context.parse_cur + parse_context.parse_left_len, true);
540                         }
541                         break;
542                 case PCRE_ERROR_NOMEMORY:
543                         die("PCRE_ERROR_NOMEMORY", ERR_PCRE_OUT_OF_MEMORY);
544                         break;
545                 default:
546                         die("PCRE ERROR", ERR_PCRE_GENERIC);
547                         break;
548         }
549
550         /* setup the parse result */
551         parse_result_initialize(pr, &parse_context);
552
553 #ifndef NDEBUG
554 /*      muntrace(); */
555 #endif
556 }
557
558
559 /*
560  * parser_test
561  *
562  * internal testing code
563  */
564 void parser_test() {
565         char buffer[] = "\"str//i\\\"ng\"";
566         ParseResult pr;
567         int len = strlen(buffer);
568         parser_parse(&pr, buffer, len, POLYGLOTS[0]);
569         printf("parsing this buffer:\n%s\n=============\n", buffer);
570         printf("__code_start__\n%s\n__code_end__\n", pr.language_breakdowns[0].code);
571         printf("__comment_start__\n%s\n__comment_end__\n", pr.language_breakdowns[0].comment);
572         printf("blanks[%d]\n", pr.language_breakdowns[0].blank_count);
573 }