5 * Created by Jason Allen on 6/23/06.
6 * Copyright 2006 Ohloh. All rights reserved.
10 /* #include <mcheck.h> - for memory debugging */
13 #include "ragel_parser.h"
16 /*****************************************************************************
18 *****************************************************************************/
21 * ParseContext holds all the state required to parse a buffer
23 * This is where we keep state of an ongoing parse. It's not in the header file because
24 * it's not meant to be shared with anyone else... strictly internal!
// NOTE(review): only part of the ParseContext definition is visible in this
// excerpt; several field declarations described below are not shown.
27 // the buffer we're parsing
30 // the length of the entire buffer
33 // the current cursor we're processing
36 // the length of the buffer left to parse
39 // where the current line started (just past the last newline)
40 char *parse_line_start;
42 // where the current state should start attribution from
45 // the index of the current line (just for debugging)
// stack of compiled states, with room for MAX_CS_STACK entries
49 CompiledState *cs_stack[MAX_CS_STACK];
52 // the state we're attributing this current line to (NULL means we haven't attributed it yet!)
53 State *line_attributed_state;
55 // language_breakdowns: one slot per language encountered, up to MAX_LANGUAGE_BREAKDOWN_SIZE
56 LanguageBreakdown language_breakdowns[MAX_LANGUAGE_BREAKDOWN_SIZE];
// number of language_breakdowns slots currently in use
57 int language_breakdown_count;
62 * parse_context_find_or_create_language_breakdown
64 * Will return a valid language_breakdown pointer for a given language name.
// Returns the breakdown in parse_context whose name equals 'name' (strcmp).
// If none exists yet, initializes the next unused slot for 'name', bumps
// language_breakdown_count and returns that new slot.
66 LanguageBreakdown *parse_context_find_or_create_language_breakdown(ParseContext *parse_context, char *name) {
// linear search over the breakdowns created so far
70 for (i_lb = 0; i_lb < parse_context->language_breakdown_count; i_lb++) {
71 if (strcmp(parse_context->language_breakdowns[i_lb].name, name) == 0) {
72 return &parse_context->language_breakdowns[i_lb];
76 // doesn't exist, create new one
77 log("[ohcount] creating language_breakdown: '%s'\n", name);
// capacity check before touching the next slot
// NOTE(review): what happens after this log is not visible in this excerpt --
// confirm execution cannot fall through to the initialize call below.
79 if (parse_context->language_breakdown_count >= MAX_LANGUAGE_BREAKDOWN_SIZE) {
80 log("[ohcount] - ASSERT FAILED: parse_context->language_breakdown_count too big (%d)\n", parse_context->language_breakdown_count);
// third argument is the full input length plus 5 bytes of slack
// (presumably a buffer size -- confirm against language_breakdown_initialize)
83 language_breakdown_initialize(&parse_context->language_breakdowns[parse_context->language_breakdown_count], name, parse_context->parse_buffer_len + 5); /* just in case we pad with newline or something */
84 log("[ohcount] done creating language_breakdown: '%s'\n", name);
// return the slot just initialized; the post-increment marks it as used
85 return &parse_context->language_breakdowns[parse_context->language_breakdown_count++];
91 * yields the just-processed line back up to an optional Ruby block,
92 * along with its language and semantic information.
// Builds a three-element Ruby array describing the line that runs from
// parse_context->parse_line_start up to 'up_to', as attributed to 'state'.
// NOTE(review): the array creation and the actual yield are not visible in
// this excerpt.
94 void parse_yield_line(ParseContext *parse_context, char *up_to, State *state) {
// nothing is built unless the caller supplied a Ruby block
96 if (rb_block_given_p()) {
// element 0: the language name as a Ruby symbol
98 rb_ary_store(ary, 0, ID2SYM(rb_intern(state->language)));
// element 2: the raw text of the line (start pointer plus byte length)
99 rb_ary_store(ary, 2, rb_str_new(parse_context->parse_line_start, up_to - parse_context->parse_line_start));
// element 1: the semantic as one of the symbols :code, :comment or :blank
101 switch (state->semantic) {
103 rb_ary_store(ary, 1, ID2SYM(rb_intern("code")));
105 case semantic_comment:
106 rb_ary_store(ary, 1, ID2SYM(rb_intern("comment")));
110 rb_ary_store(ary, 1, ID2SYM(rb_intern("blank")));
120 * parse_context_process_line
122 * will 'consume' the current line (parse_context->parse_line_start to 'up_to').
123 * code and comments are copied, blanks are simply tallied up.
// Consumes the line from parse_context->parse_line_start to 'up_to': offers it
// to the Ruby block first, then records it in the breakdown of state->language
// according to state->semantic.
125 void parse_context_process_line(ParseContext *parse_context, char *up_to, State *state) {
// yield happens before the breakdown is updated
126 parse_yield_line(parse_context, up_to, state);
// fetch (or lazily create) the breakdown for this state's language
128 LanguageBreakdown *lb = parse_context_find_or_create_language_breakdown(parse_context,state->language);
129 switch (state->semantic) {
// code lines are copied into the breakdown
131 language_breakdown_copy_code(lb, parse_context->parse_line_start, up_to);
133 case semantic_comment:
// comment lines are copied into the breakdown
134 language_breakdown_copy_comment(lb, parse_context->parse_line_start, up_to);
// blank lines are only logged here; the tally itself is not visible in this excerpt
138 log("[ohcount] blankline at line %d\n", parse_context->cur_line_index);
// any other semantic value is treated as a fatal error
142 die("Unknown semantic", ERR_UNKNOWN_SEMANTIC);
147 * parse_context_current_cs
149 * accessor for the top of the CompiledState stack
// Returns the CompiledState on top of the stack, i.e. the entry at
// cs_stack[cs_stack_index - 1].
151 CompiledState *parse_context_current_cs(ParseContext *parse_context) {
// sanity check: the index must lie in [0, MAX_CS_STACK)
// NOTE(review): the action taken after this log is not visible in this excerpt
153 if (parse_context->cs_stack_index < 0 || parse_context->cs_stack_index >= MAX_CS_STACK) {
154 log("[ohcount] - ASSERT FAILED: parse_context->cs_stack_index out of bounds (%d)\n", parse_context->cs_stack_index);
// empty stack is handled separately (result not visible here; presumably NULL -- confirm)
157 if (parse_context->cs_stack_index == 0) {
// cs_stack_index points one past the top element
160 return parse_context->cs_stack[parse_context->cs_stack_index - 1];
164 * parse_context_current_state
166 * accessor for state represented by the top of the CompiledState stack
// Returns the State associated with the CompiledState on top of the stack.
// NOTE(review): only the lookup of the top CompiledState is visible in this
// excerpt; how an empty stack is handled must be confirmed in the full file.
168 State *parse_context_current_state(ParseContext *parse_context) {
169 CompiledState *cs = parse_context_current_cs(parse_context);
177 * parse_context_last_attributed_semantic
179 * accessor that returns the current line's attributed semantic (null if none was attributed
// Returns the semantic of the state the current line has been attributed to,
// or semantic_null when the line has not been attributed yet.
182 enum Semantic parse_context_last_attributed_semantic(ParseContext *parse_context) {
// no attribution so far on this line
183 if (parse_context->line_attributed_state == NULL) {
184 return semantic_null;
186 return parse_context->line_attributed_state->semantic;
190 * parse_context_current_pcre
192 * returns the pcre (compiled regular expression) for the current state
// Returns the pcre member of the CompiledState on top of the stack.
// The result of parse_context_current_cs is dereferenced without a check.
194 pcre *parse_context_current_pcre(ParseContext *parse_context) {
195 return parse_context_current_cs(parse_context)->pcre;
199 * parse_context_current_pcre_extra
201 * returns the pcre_extra (compiled regular expression additional hints) for the current state
// Returns the pcre_extra member of the CompiledState on top of the stack.
// The result of parse_context_current_cs is dereferenced without a check.
203 pcre_extra *parse_context_current_pcre_extra(ParseContext *parse_context) {
204 return parse_context_current_cs(parse_context)->pcre_extra;
208 * parse_context_attribute
210 * Determines whether the chunk of code seen up to 'at' should be attributed to the
211 * current state or not. The rules are pretty simple:
212 * - semantic_null < semantic_blank < semantic_comment < semantic_code
213 * - comment and code don't count if there are only blank characters
215 * if we ate a newline then we also do some postprocessing -- mostly copy the current
216 * line to the appropriate buffer.
// Decides whether the text between parse_context->attribute_from and 'at'
// changes the state the current line is attributed to. When 'process_line' is
// set (see the end of the function) the finished line is handed to
// parse_context_process_line and the per-line bookkeeping is reset.
219 void parse_context_attribute(ParseContext *parse_context, char *at, bool process_line) {
// snapshot of what this line has been attributed to so far, and the current state
220 enum Semantic last_semantic = parse_context_last_attributed_semantic(parse_context);
221 State *state = parse_context_current_state(parse_context);
222 bool trumped = state_trumps_language(state, parse_context->line_attributed_state);
// --- debug logging: show a short prefix of the text being attributed ---
224 log0("[ohcount] - __ATTRIBUTION__\n");
227 int max_chars = (at - parse_context->attribute_from);
// the preview is limited to 19 characters (clamping statement not visible here)
// NOTE(review): no check for a negative max_chars is visible before the strncpy below
228 if (max_chars > 19) {
231 strncpy(temp_buf, parse_context->attribute_from, max_chars);
// strncpy does not terminate when it copies max_chars bytes, so terminate explicitly
232 temp_buf[max_chars] = 0;
234 log2("[ohcount] - state[%s] eating '%s'\n", state->name, temp_buf);
236 log("[ohcount] - NULL state eating '%s'\n", temp_buf);
238 State *last_state = parse_context->line_attributed_state;
240 log("[ohcount] - last_attributed_state: '%s'\n", last_state->name);
246 // shortcut -- if we've already found code, nothing else could really make a difference -- just bail
// ...unless the current state trumps the previously attributed one
247 if (last_semantic != semantic_code || trumped) {
249 // main code to attribute the chunk of code
// NOTE(review): 'at' and attribute_from are pointers but are passed for %d -- confirm the log macro tolerates this
250 log2("[ohcount] - attributing(at[%d], attribute_from[%d]\n", at, parse_context->attribute_from);
// only a non-empty span can change the attribution
251 if (at > parse_context->attribute_from) {
253 // if we're attributing to blank, we don't care what's in the string
254 if (state->semantic == semantic_blank && last_semantic == semantic_null) {
255 parse_context->line_attributed_state = state;
256 log2("[ohcount] - line %d now being assigned state '%s'\n", parse_context->cur_line_index, state->name);
259 // we need to see some non-blank characters before we can accredit a comment or code
260 char *cur = parse_context->attribute_from;
261 bool saw_non_blank = false;
// a character counts as non-blank only if its value is above 32, so the space
// character (32) itself is treated as blank
// NOTE(review): if plain char is signed, bytes >= 0x80 compare as negative and count as blank
263 if (*cur > 32) { /* ascii chars below 32 are non-printing */
264 log2("attributing character 0x%x %c\n", (int)*cur, *cur);
265 saw_non_blank = true;
// upgrade from blank/null to code or comment (first part of this condition is not visible here)
272 (last_semantic == semantic_blank || last_semantic == semantic_null) &&
273 (state->semantic == semantic_code || state->semantic == semantic_comment) )) {
274 parse_context->line_attributed_state = state;
275 log2("[ohcount] - line %d now being assigned state '%s'\n", parse_context->cur_line_index, state->name);
// code trumps a comment seen earlier on the same line
276 } else if (last_semantic == semantic_comment && state->semantic == semantic_code) {
277 parse_context->line_attributed_state = state;
278 log2("[ohcount] - line %d now being assigned state '%s'\n", parse_context->cur_line_index, state->name);
285 // copy line to appropriate buffer, if appropriate
288 State *attributed_state = parse_context->line_attributed_state;
291 if (attributed_state == NULL) {
292 // if the line is totally blank, we haven't attributed it to anything yet
293 // instead, we'll invent a temporary state (same language.name as previous state on stack, but semantic blank)
294 temp_state.language = parse_context_current_state(parse_context)->language;
295 temp_state.name = parse_context_current_state(parse_context)->name;
296 temp_state.semantic = semantic_blank;
297 attributed_state = &temp_state;
298 log("[ohcount] - eating_newline. line_attributed_state=[MADE UP! semantic blank, language->%s\n", attributed_state->language);
300 log("[ohcount] - eating_newline. line_attributed_state=%s\n", attributed_state->name);
// hand the finished line over, then reset the per-line bookkeeping so that the
// next line starts at 'at' with no attribution
303 parse_context_process_line(parse_context, at, attributed_state);
304 parse_context->parse_line_start = at;
305 parse_context->attribute_from = at;
306 parse_context->line_attributed_state = NULL;
307 parse_context->cur_line_index++;
314 * parse_context_transit
316 * performs transition to new state (by pushing or popping the compiled_state stack)
// Moves the parser to compiled state 'cs' by adjusting the cs_stack, then
// restarts attribution at 'at'.
// NOTE(review): the condition choosing between the two branches below is not
// visible in this excerpt.
319 void parse_context_transit(ParseContext *parse_context, CompiledState *cs, char *at) {
// branch that decrements the stack index: refuse to go below an empty stack
323 if (parse_context->cs_stack_index <= 0) {
324 log("[ohcount] - ASSERT FAILED: cs_stack_index underflow (%d)\n", parse_context->cs_stack_index);
// pre-decrement, then store 'cs' into the slot that was the top
327 parse_context->cs_stack[--parse_context->cs_stack_index] = cs;
// branch that increments the stack index: check capacity first
// NOTE(review): this test already fires when cs_stack_index == MAX_CS_STACK - 1,
// so the last slot appears never to be used -- confirm that is intended
330 if (parse_context->cs_stack_index + 1 >= MAX_CS_STACK) {
331 log("[ohcount] - ASSERT FAILED: cs_stack_index overflow (%d)\n", parse_context->cs_stack_index);
// store 'cs' in the next free slot, then advance the index
334 parse_context->cs_stack[parse_context->cs_stack_index++] = cs;
// whatever follows 'at' is attributed starting from here
336 parse_context->attribute_from = at;
341 * parse_context_initialize
343 * Initializes a parse_context to be ready to start parsing.
// Resets every cursor and counter of parse_context so that parsing starts at
// the beginning of 'buffer', then enters 'initial_cs_state'.
346 void parse_context_initialize(ParseContext *parse_context, char *buffer, int buffer_len, CompiledState *initial_cs_state) {
// all cursors start at the beginning of the buffer; the whole length is still to parse
347 parse_context->parse_buffer = buffer;
348 parse_context->parse_buffer_len = buffer_len;
349 parse_context->parse_cur = buffer;
350 parse_context->attribute_from = buffer;
351 parse_context->parse_left_len = buffer_len;
352 parse_context->parse_line_start = buffer;
353 parse_context->cur_line_index = 1; // editors are 1-based...debugging is easier
// empty state stack, no attribution for the first line, no breakdowns yet
355 parse_context->cs_stack_index = 0;
356 parse_context->line_attributed_state = NULL;
357 parse_context->language_breakdown_count = 0;
// NOTE(review): attribute runs while the state stack is still empty (the initial
// state is pushed on the next line) -- confirm the current-state accessors cope with that
359 parse_context_attribute(parse_context, buffer, false);
360 parse_context_transit(parse_context, initial_cs_state, buffer);
365 * parse_context_get_transition
367 * returns the "nth" transition from the current parse_context.
// Looks up transition number 'transition_index' of the CompiledState on top of
// the stack by delegating to compiled_state_get_transition.
370 Transition *parse_context_get_transition(ParseContext *parse_context, int transition_index) {
371 CompiledState *compiled_state = parse_context_current_cs(parse_context);
372 return compiled_state_get_transition(compiled_state, transition_index);
377 /*****************************************************************************
379 *****************************************************************************/
382 * parse_result_initialize
384 * initializes a parse_result from the parse_context
// Fills 'pr' from 'parse_context': each used language breakdown is assigned
// element by element, and the count is copied.
// NOTE(review): plain struct assignment -- any pointers inside LanguageBreakdown
// would then be shared between the context and the result; confirm ownership.
387 void parse_result_initialize(ParseResult *pr, ParseContext *parse_context) {
389 for (i_lb = 0; i_lb < parse_context->language_breakdown_count; i_lb++) {
390 pr->language_breakdowns[i_lb] = parse_context->language_breakdowns[i_lb];
392 pr->language_breakdown_count = parse_context->language_breakdown_count;
398 * Deallocates the memory held by a ParseResult.
// Calls language_breakdown_free on each of the language_breakdown_count
// breakdowns held by 'parse_result'.
401 void parse_result_free(ParseResult *parse_result) {
403 for (i_lb = 0; i_lb < parse_result->language_breakdown_count; i_lb++) {
404 language_breakdown_free(&parse_result->language_breakdowns[i_lb]);
409 /*****************************************************************************
411 *****************************************************************************/
416 * As a debugging tool, we print out the exact matched string.
// Debug helper: logs the current state's name together with the text of the
// captured substring number (result - 1) from the latest match.
419 void parser_print_match(ParseContext *parse_context, int *ovector, int result) {
// copy that substring into 'match', passing 10 as the buffer size
421 pcre_copy_substring(parse_context->parse_cur, ovector, result, result - 1, match, 10);
// print a newline match as the two characters \n so the log stays on one line
422 if (match[0] == '\n') {
423 log2("[ohcount] state '%s' matched [%s]\n", parse_context_current_state(parse_context)->name, "\\n");
425 log2("[ohcount] state '%s' matched [%s]\n", parse_context_current_state(parse_context)->name, match);
432 * returns true if the pcre result ate a newline
// Reports whether the matched text contained a newline.
// NOTE(review): only the computation of the scan bounds is visible in this excerpt.
435 bool parser_ate_newline(ParseContext *parse_context, int *ovector) {
// ovector[0] and ovector[1] are offsets relative to parse_cur; 'c' and 'c_last'
// are the corresponding pointers into the buffer
436 char *c = parse_context->parse_cur + ovector[0];
437 char *c_last = parse_context->parse_cur + ovector[1];
450 * The main parsing algorithm consists of doing a DFA walk on the source code.
451 * We start in the initial state of the language and then maintain a stack of
452 * states. Transitions are triggered as regular expression matches (as defined
453 * by each state). At every transition we account for the code seen. We keep
454 * track for each line what semantic we've seen so far. Semantics trump each
455 * other in the following order: null < blank < comment < code. As soon as we
456 * see any code, we pretty much ignore other semantics until the newline. We
457 * do, however, keep parsing since we need to maintain the states properly -
458 * in other words, jumping to the newline might make us forget to jump out
459 * of a string state, or something.
// Parses 'buffer' (buffer_len bytes) using the states of 'polyglot' and stores
// the per-language breakdowns in 'pr'. The ragel parser is tried first; the
// PCRE-driven state walk below is the other path.
462 void parser_parse(ParseResult *pr, char *buffer, int buffer_len, Polyglot *polyglot) {
464 /* to help debug, export MALLOC_TRACE to output file */
// try the ragel-based parser for this language name
// (the action taken when it returns non-zero is not visible in this excerpt)
468 if (ragel_parser_parse(pr, buffer, buffer_len, polyglot->name))
471 // make sure we have compiled states
472 polyglot_compile_states(polyglot);
474 // setup the parse context
475 ParseContext parse_context;
// parsing starts in the polyglot's first compiled state
476 parse_context_initialize(&parse_context, buffer, buffer_len, &polyglot->compiled_states[0]);
// main loop: run the current state's regex on the unparsed remainder until pcre_exec
// returns a negative code; 30 is the ovector size passed to pcre_exec
// NOTE(review): NULL is passed as the pcre_extra argument although
// parse_context_current_pcre_extra exists -- confirm this is intentional
481 while ((result = pcre_exec(parse_context_current_pcre(&parse_context), NULL, parse_context.parse_cur, parse_context.parse_left_len, 0, 0, ovector, 30)) >= 0) {
484 log("[ohcount] pcre result: %d\n", result);
485 parser_print_match(&parse_context, ovector, result);
488 // crappy hack work around to solve surprisingly complex bug
489 // it's all about the last line - how to avoid attributing twice or not at all
490 // The complexity comes about because sometimes we actually account for it
491 // "automatically" - like when the file ends with a newline. However, when it doesn't
495 // transition if possible (there might not be one if its a newline!)
496 Transition *t = parse_context_get_transition(&parse_context, result - 2); // -1 for pcre_exec storing the entire match first, -1 to be zero-based (pcre is 1-based, kinda)
498 if (t && t->fake_transition) {
499 // fake transition -- just ignore it!
500 log0("- fake transition, still in current state");
502 CompiledState *target_cs = NULL;
503 if (t && t->to_state) {
504 // find the target compiled_state
// NOTE(review): this scan has no upper bound; it relies on t->to_state being
// present somewhere in polyglot->compiled_states
505 for (target_cs = polyglot->compiled_states; target_cs->state != t->to_state; target_cs++) {}
508 // source or target: who eats the matched string itself?
// with no transition, or when the source state eats the token, attribution runs
// to the end of the match; otherwise it stops at the start of the match
509 int at = (t == NULL || t->token_eater == FromEatsToken) ? ovector[1] : ovector[0];
511 // attribute the code/text/blanks we've seen so far
512 bool ate_newline = parser_ate_newline(&parse_context, ovector);
513 parse_context_attribute(&parse_context, parse_context.parse_cur + at, ate_newline);
515 // and transit to our new state (note: we usually won't have a transition if we hit a newline)
516 // set the 'at' at the proper place, depending on TokenEater
517 at = (t != NULL && (t->token_eater == ToEatsToken)) ? ovector[0] : ovector[1];
518 if (t && !t->fake_transition) {
// NOTE(review): a pointer expression is passed for %d here
519 log("[ohcount] - transition at %d\n", parse_context.parse_cur + at);
520 parse_context_transit(&parse_context, target_cs, parse_context.parse_cur + at);
525 log2("[ohcount] -- starting line %d in state %s\n", parse_context.cur_line_index, parse_context_current_state(&parse_context)->name);
// advance the cursor by the larger of the two match offsets
530 int jump_chars = (ovector[0] > ovector[1]) ? ovector[0] : ovector[1];
531 parse_context.parse_left_len -= jump_chars ;
532 parse_context.parse_cur += jump_chars;
// handling of the negative pcre_exec result
536 case PCRE_ERROR_NOMATCH:
537 // attribute what we (might have) eaten so far...
// only when the remaining text does not already end with a newline
// NOTE(review): if parse_left_len is 0 this reads parse_cur[-1]; confirm a guard
// exists in the lines not shown
538 if (parse_context.parse_cur[parse_context.parse_left_len - 1] != '\n') {
539 parse_context_attribute(&parse_context, parse_context.parse_cur + parse_context.parse_left_len, true);
542 case PCRE_ERROR_NOMEMORY:
543 die("PCRE_ERROR_NOMEMORY", ERR_PCRE_OUT_OF_MEMORY);
// every other error code is fatal as a generic PCRE error
546 die("PCRE ERROR", ERR_PCRE_GENERIC);
550 /* setup the parse result */
551 parse_result_initialize(pr, &parse_context);
562 * internal testing code
// Internal test driver: parses one small hard-coded buffer with POLYGLOTS[0]
// and prints what ended up in the first language breakdown.
// NOTE(review): the enclosing function header and the declaration of 'pr' are
// not visible in this excerpt.
// sample input: a quoted string literal containing "//" and an escaped quote
565 char buffer[] = "\"str//i\\\"ng\"";
567 int len = strlen(buffer);
568 parser_parse(&pr, buffer, len, POLYGLOTS[0]);
569 printf("parsing this buffer:\n%s\n=============\n", buffer);
// only breakdown 0 is printed
570 printf("__code_start__\n%s\n__code_end__\n", pr.language_breakdowns[0].code);
571 printf("__comment_start__\n%s\n__comment_end__\n", pr.language_breakdowns[0].comment);
572 printf("blanks[%d]\n", pr.language_breakdowns[0].blank_count);