1 // html.rl written by Mitchell Foral. mitchell<att>caladbolg<dott>net.
3 /************************* Required for every parser *************************/
4 #ifndef OHCOUNT_HTML_PARSER_H
5 #define OHCOUNT_HTML_PARSER_H
7 #include "../parser_macros.h"
9 // the name of the language; passed to the callback and to the line-counting macros in this file
10 const char *HTML_LANG = LANG_HTML;
12 // the language's entities
13 const char *html_entities[] = {
14 "space", "comment", "doctype",
15 "tag", "entity", "any"
18 // constants associated with the entities
20 HTML_SPACE = 0, HTML_COMMENT, HTML_DOCTYPE,
21 HTML_TAG, HTML_ENTITY, HTML_ANY
24 /*****************************************************************************/
27 #include "javascript.h"
32 include common "common.rl";
36 # Line counting machine
38 action html_ccallback {
47 std_internal_newline(HTML_LANG)
50 std_newline(HTML_LANG)
52 case CHECK_BLANK_ENTRY:
53 check_blank_entry(HTML_LANG)
59 newline %{ entity = INTERNAL_NL; } %html_ccallback
63 (nonnewline - ws) @comment
# Quoted strings (single-quoted, double-quoted, and their union).
# Each form is: opening quote, then zero or more of either
#   - one character that is not CR, LF, FF, the delimiting quote or a backslash, or
#   - a backslash followed by one nonnewline character,
# then the closing quote. The code action is embedded on the closing quote.
66 html_sq_str = '\'' ([^\r\n\f'\\] | '\\' nonnewline)* '\'' @code;
67 html_dq_str = '"' ([^\r\n\f"\\] | '\\' nonnewline)* '"' @code;
68 html_string = html_sq_str | html_dq_str;
# Whitespace, or a newline that occurs inside a larger construct.
# For the newline alternative, entity is set to INTERNAL_NL by a finishing (@)
# action, and html_ccallback is run as a leaving (%) action.
70 ws_or_inl = (ws | newline @{ entity = INTERNAL_NL; } %html_ccallback);
# Opening and closing tags that delimit embedded CSS (line-counting machine).
# Entry: a less-than sign, the word style (case-insensitive), one or more
# characters other than a greater-than sign ending at the literal text/css
# (finish-guarded concatenation), one or more further such characters, and the
# closing greater-than sign, which carries the code action. Both runs use +, so
# at least one character is required on each side of the text/css literal.
# Outry: the closing style tag (case-insensitive), with any amount of
# ws_or_inl before the greater-than sign; that final character carries
# check_blank_outry followed by code.
72 html_css_entry = '<' /style/i [^>]+ :>> 'text/css' [^>]+ '>' @code;
73 html_css_outry = '</' /style/i ws_or_inl* '>' @check_blank_outry @code;
75 html_css_outry @{ p = ts; fret; };
76 # unmodified CSS patterns
77 spaces ${ entity = CSS_SPACE; } => css_ccallback;
80 newline ${ entity = NEWLINE; } => css_ccallback;
81 ^space ${ entity = CSS_ANY; } => css_ccallback;
# Opening and closing tags that delimit embedded Javascript (line-counting machine).
# Entry: a less-than sign, the word script (case-insensitive), one or more
# characters other than a greater-than sign ending at the literal
# text/javascript (finish-guarded concatenation), one or more further such
# characters, and the closing greater-than sign, which carries the code action.
# Both runs use +, so at least one character is required on each side of the
# text/javascript literal.
# Outry: the closing script tag (case-insensitive), with any amount of
# ws_or_inl before the greater-than sign; that final character carries
# check_blank_outry followed by code.
84 html_js_entry = '<' /script/i [^>]+ :>> 'text/javascript' [^>]+ '>' @code;
85 html_js_outry = '</' /script/i ws_or_inl* '>' @check_blank_outry @code;
87 html_js_outry @{ p = ts; fret; };
88 # unmodified Javascript patterns
89 spaces ${ entity = JS_SPACE; } => js_ccallback;
92 newline ${ entity = NEWLINE; } => js_ccallback;
93 ^space ${ entity = JS_ANY; } => js_ccallback;
97 html_css_entry @{ entity = CHECK_BLANK_ENTRY; } @html_ccallback
98 @{ saw(CSS_LANG); } => { fcall html_css_line; };
99 html_js_entry @{ entity = CHECK_BLANK_ENTRY; } @html_ccallback
100 @{ saw(JS_LANG); } => { fcall html_js_line; };
101 # standard HTML patterns
102 spaces ${ entity = HTML_SPACE; } => html_ccallback;
105 newline ${ entity = NEWLINE; } => html_ccallback;
106 ^space ${ entity = HTML_ANY; } => html_ccallback;
111 action html_ecallback {
112 callback(HTML_LANG, html_entities[entity], cint(ts), cint(te), userdata);
# Entity-machine counterparts of html_css_entry and html_css_outry: the same
# tag patterns, written without the code and check_blank_outry actions.
# NOTE(review): ws_or_inl itself embeds entity = INTERNAL_NL and the
# line-counting html_ccallback action; confirm that is intended here.
115 html_css_entry_entity = '<' /style/i [^>]+ :>> 'text/css' [^>]+ '>';
116 html_css_outry_entity = '</' /style/i ws_or_inl* '>';
117 html_css_entity := |*
118 html_css_outry_entity @{ fret; };
119 # unmodified CSS patterns
120 space+ ${ entity = CSS_SPACE; } => css_ecallback;
121 css_comment_entity ${ entity = CSS_COMMENT; } => css_ecallback;
# Entity-machine counterparts of html_js_entry and html_js_outry: the same
# tag patterns, written without the code and check_blank_outry actions.
# NOTE(review): ws_or_inl itself embeds entity = INTERNAL_NL and the
# line-counting html_ccallback action; confirm that is intended here.
126 html_js_entry_entity = '<' /script/i [^>]+ :>> 'text/javascript' [^>]+ '>';
127 html_js_outry_entity = '</' /script/i ws_or_inl* '>';
129 html_js_outry_entity @{ fret; };
130 # unmodified Javascript patterns
131 space+ ${ entity = JS_SPACE; } => js_ecallback;
132 js_comment_entity ${ entity = JS_COMMENT; } => js_ecallback;
# An HTML comment: the opening marker, then any characters (newlines included,
# since any is used) up to and including the first closing marker. The
# finish-guarded concatenation stops the any* run once the closer is complete.
137 html_comment_entity = '<!--' any* :>> '-->';
140 # TODO: html_ecallback for html_*_{entry,outry}_entity
141 html_css_entry_entity => { fcall html_css_entity; };
142 html_js_entry_entity => { fcall html_js_entity; };
143 # standard HTML patterns
144 space+ ${ entity = HTML_SPACE; } => html_ecallback;
145 html_comment_entity ${ entity = HTML_COMMENT; } => html_ecallback;
151 /************************* Required for every parser *************************/
153 /* Parses a string buffer with HTML markup.
155 * @param *buffer The string to parse.
156 * @param length The length of the string to parse.
157 * @param count Integer flag specifying whether or not to count lines. If yes,
158 * uses the Ragel machine optimized for counting. Otherwise uses the Ragel
159 * machine optimized for returning entity positions.
160 * @param *callback Callback function. If count is set, callback is called for
161 * every line of code, comment, or blank with 'lcode', 'lcomment', and
162 * 'lblank' respectively. Otherwise callback is called for each entity found.
164 void parse_html(char *buffer, int length, int count,
165 void (*callback) (const char *lang, const char *entity, int s,
172 cs = (count) ? html_en_html_line : html_en_html_entity;
175 // if no newline at EOF; callback contents of last line
176 if (count) { process_last_line(HTML_LANG) }
181 /*****************************************************************************/