Commit | Line | Data |
---|---|---|
6386edc5 MM |
1 | /* |
2 | ** 2001 September 15 | |
3 | ** | |
4 | ** The author disclaims copyright to this source code. In place of | |
5 | ** a legal notice, here is a blessing: | |
6 | ** | |
7 | ** May you do good and not evil. | |
8 | ** May you find forgiveness for yourself and forgive others. | |
9 | ** May you share freely, never taking more than you give. | |
10 | ** | |
11 | ************************************************************************* | |
12 | ** A tokenizer for SQL | |
13 | ** | |
14 | ** This file contains C code that splits an SQL input string up into | |
15 | ** individual tokens and sends those tokens one-by-one over to the | |
16 | ** parser for analysis. | |
17 | */ | |
18 | ||
6386edc5 | 19 | #include <ctype.h> |
e37c6e18 | 20 | #include <stdarg.h> |
6386edc5 | 21 | #include <stdlib.h> |
e37c6e18 AJ |
22 | |
23 | #include "windef.h" | |
24 | #include "winbase.h" | |
5c850c6a | 25 | #include "wine/unicode.h" |
6386edc5 | 26 | #include "query.h" |
af54ef9c | 27 | #include "sql.tab.h" |
6386edc5 | 28 | |
6386edc5 MM |
29 | /* |
30 | ** All the keywords of the SQL language are stored as in a hash | |
31 | ** table composed of instances of the following structure. | |
32 | */ | |
33 | typedef struct Keyword Keyword; | |
34 | struct Keyword { | |
5c850c6a | 35 | const WCHAR *zName; /* The keyword name */ |
6386edc5 MM |
36 | int tokenType; /* The token value for this keyword */ |
37 | }; | |
38 | ||
f9042ec9 MM |
39 | #define MAX_TOKEN_LEN 11 |
40 | ||
0fd733bf | 41 | static const WCHAR ADD_W[] = { 'A','D','D',0 }; |
57443e01 | 42 | static const WCHAR ALTER_W[] = { 'A','L','T','E','R',0 }; |
5c850c6a | 43 | static const WCHAR AND_W[] = { 'A','N','D',0 }; |
5c850c6a | 44 | static const WCHAR BY_W[] = { 'B','Y',0 }; |
5c850c6a JL |
45 | static const WCHAR CHAR_W[] = { 'C','H','A','R',0 }; |
46 | static const WCHAR CHARACTER_W[] = { 'C','H','A','R','A','C','T','E','R',0 }; | |
5c850c6a | 47 | static const WCHAR CREATE_W[] = { 'C','R','E','A','T','E',0 }; |
5c850c6a | 48 | static const WCHAR DELETE_W[] = { 'D','E','L','E','T','E',0 }; |
5c850c6a | 49 | static const WCHAR DISTINCT_W[] = { 'D','I','S','T','I','N','C','T',0 }; |
62c544cf | 50 | static const WCHAR DROP_W[] = { 'D','R','O','P',0 }; |
399321c4 | 51 | static const WCHAR FREE_W[] = { 'F','R','E','E',0 }; |
5c850c6a | 52 | static const WCHAR FROM_W[] = { 'F','R','O','M',0 }; |
5c850c6a | 53 | static const WCHAR HOLD_W[] = { 'H','O','L','D',0 }; |
5c850c6a | 54 | static const WCHAR INSERT_W[] = { 'I','N','S','E','R','T',0 }; |
5c850c6a | 55 | static const WCHAR INT_W[] = { 'I','N','T',0 }; |
88580113 | 56 | static const WCHAR INTEGER_W[] = { 'I','N','T','E','G','E','R',0 }; |
5c850c6a JL |
57 | static const WCHAR INTO_W[] = { 'I','N','T','O',0 }; |
58 | static const WCHAR IS_W[] = { 'I','S',0 }; | |
5c850c6a | 59 | static const WCHAR KEY_W[] = { 'K','E','Y',0 }; |
5c850c6a | 60 | static const WCHAR LIKE_W[] = { 'L','I','K','E',0 }; |
5c850c6a JL |
61 | static const WCHAR LOCALIZABLE_W[] = { 'L','O','C','A','L','I','Z','A','B','L','E',0 }; |
62 | static const WCHAR LONG_W[] = { 'L','O','N','G',0 }; | |
63 | static const WCHAR LONGCHAR_W[] = { 'L','O','N','G','C','H','A','R',0 }; | |
5c850c6a | 64 | static const WCHAR NOT_W[] = { 'N','O','T',0 }; |
5c850c6a JL |
65 | static const WCHAR NULL_W[] = { 'N','U','L','L',0 }; |
66 | static const WCHAR OBJECT_W[] = { 'O','B','J','E','C','T',0 }; | |
5c850c6a JL |
67 | static const WCHAR OR_W[] = { 'O','R',0 }; |
68 | static const WCHAR ORDER_W[] = { 'O','R','D','E','R',0 }; | |
5c850c6a | 69 | static const WCHAR PRIMARY_W[] = { 'P','R','I','M','A','R','Y',0 }; |
5c850c6a JL |
70 | static const WCHAR SELECT_W[] = { 'S','E','L','E','C','T',0 }; |
71 | static const WCHAR SET_W[] = { 'S','E','T',0 }; | |
72 | static const WCHAR SHORT_W[] = { 'S','H','O','R','T',0 }; | |
5c850c6a | 73 | static const WCHAR TABLE_W[] = { 'T','A','B','L','E',0 }; |
5c850c6a | 74 | static const WCHAR TEMPORARY_W[] = { 'T','E','M','P','O','R','A','R','Y',0 }; |
5c850c6a | 75 | static const WCHAR UPDATE_W[] = { 'U','P','D','A','T','E',0 }; |
5c850c6a | 76 | static const WCHAR VALUES_W[] = { 'V','A','L','U','E','S',0 }; |
5c850c6a JL |
77 | static const WCHAR WHERE_W[] = { 'W','H','E','R','E',0 }; |
78 | ||
6386edc5 MM |
79 | /* |
80 | ** These are the keywords | |
d2364d5d | 81 | ** They MUST be in alphabetical order |
6386edc5 MM |
82 | */ |
83 | static const Keyword aKeywordTable[] = { | |
0fd733bf | 84 | { ADD_W, TK_ADD }, |
57443e01 | 85 | { ALTER_W, TK_ALTER }, |
5c850c6a | 86 | { AND_W, TK_AND }, |
5c850c6a | 87 | { BY_W, TK_BY }, |
5c850c6a JL |
88 | { CHAR_W, TK_CHAR }, |
89 | { CHARACTER_W, TK_CHAR }, | |
5c850c6a | 90 | { CREATE_W, TK_CREATE }, |
5c850c6a | 91 | { DELETE_W, TK_DELETE }, |
5c850c6a | 92 | { DISTINCT_W, TK_DISTINCT }, |
62c544cf | 93 | { DROP_W, TK_DROP }, |
57443e01 | 94 | { FREE_W, TK_FREE }, |
5c850c6a | 95 | { FROM_W, TK_FROM }, |
5c850c6a | 96 | { HOLD_W, TK_HOLD }, |
5c850c6a | 97 | { INSERT_W, TK_INSERT }, |
5c850c6a | 98 | { INT_W, TK_INT }, |
88580113 | 99 | { INTEGER_W, TK_INT }, |
5c850c6a JL |
100 | { INTO_W, TK_INTO }, |
101 | { IS_W, TK_IS }, | |
5c850c6a | 102 | { KEY_W, TK_KEY }, |
5c850c6a | 103 | { LIKE_W, TK_LIKE }, |
5c850c6a JL |
104 | { LOCALIZABLE_W, TK_LOCALIZABLE }, |
105 | { LONG_W, TK_LONG }, | |
106 | { LONGCHAR_W, TK_LONGCHAR }, | |
5c850c6a | 107 | { NOT_W, TK_NOT }, |
5c850c6a JL |
108 | { NULL_W, TK_NULL }, |
109 | { OBJECT_W, TK_OBJECT }, | |
5c850c6a JL |
110 | { OR_W, TK_OR }, |
111 | { ORDER_W, TK_ORDER }, | |
5c850c6a | 112 | { PRIMARY_W, TK_PRIMARY }, |
5c850c6a JL |
113 | { SELECT_W, TK_SELECT }, |
114 | { SET_W, TK_SET }, | |
115 | { SHORT_W, TK_SHORT }, | |
5c850c6a | 116 | { TABLE_W, TK_TABLE }, |
77d3c598 | 117 | { TEMPORARY_W, TK_TEMPORARY }, |
5c850c6a | 118 | { UPDATE_W, TK_UPDATE }, |
5c850c6a | 119 | { VALUES_W, TK_VALUES }, |
5c850c6a | 120 | { WHERE_W, TK_WHERE }, |
6386edc5 MM |
121 | }; |
122 | ||
123 | #define KEYWORD_COUNT ( sizeof aKeywordTable/sizeof (Keyword) ) | |
124 | ||
f9042ec9 MM |
125 | /* |
126 | ** Comparison function for binary search. | |
127 | */ | |
128 | static int compKeyword(const void *m1, const void *m2){ | |
129 | const Keyword *k1 = m1, *k2 = m2; | |
130 | ||
131 | return strcmpiW( k1->zName, k2->zName ); | |
132 | } | |
133 | ||
6386edc5 MM |
134 | /* |
135 | ** This function looks up an identifier to determine if it is a | |
136 | ** keyword. If it is a keyword, the token code of that keyword is | |
137 | ** returned. If the input is not a keyword, TK_ID is returned. | |
138 | */ | |
b58a098a | 139 | static int sqliteKeywordCode(const WCHAR *z, int n){ |
f9042ec9 MM |
140 | WCHAR str[MAX_TOKEN_LEN+1]; |
141 | Keyword key, *r; | |
142 | ||
143 | if( n>MAX_TOKEN_LEN ) | |
144 | return TK_ID; | |
6386edc5 | 145 | |
f9042ec9 MM |
146 | memcpy( str, z, n*sizeof (WCHAR) ); |
147 | str[n] = 0; | |
148 | key.tokenType = 0; | |
149 | key.zName = str; | |
150 | r = bsearch( &key, aKeywordTable, KEYWORD_COUNT, sizeof (Keyword), compKeyword ); | |
151 | if( r ) | |
152 | return r->tokenType; | |
6386edc5 MM |
153 | return TK_ID; |
154 | } | |
155 | ||
156 | ||
157 | /* | |
158 | ** If X is a character that can be used in an identifier then | |
159 | ** isIdChar[X] will be 1. Otherwise isIdChar[X] will be 0. | |
160 | ** | |
161 | ** In this implementation, an identifier can be a string of | |
162 | ** alphabetic characters, digits, and "_" plus any character | |
163 | ** with the high-order bit set. The latter rule means that | |
164 | ** any sequence of UTF-8 characters or characters taken from | |
165 | ** an extended ISO8859 character set can form an identifier. | |
166 | */ | |
167 | static const char isIdChar[] = { | |
168 | /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ | |
f80f1cc2 | 169 | 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x */ |
6386edc5 | 170 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1x */ |
f80f1cc2 | 171 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, /* 2x */ |
6386edc5 MM |
172 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ |
173 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ | |
174 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */ | |
175 | 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ | |
176 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ | |
177 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 8x */ | |
178 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 9x */ | |
179 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* Ax */ | |
180 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* Bx */ | |
181 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* Cx */ | |
182 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* Dx */ | |
183 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* Ex */ | |
184 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* Fx */ | |
185 | }; | |
186 | ||
187 | ||
188 | /* | |
189 | ** Return the length of the token that begins at z[0]. Return | |
190 | ** -1 if the token is (or might be) incomplete. Store the token | |
191 | ** type in *tokenType before returning. | |
192 | */ | |
193 | int sqliteGetToken(const WCHAR *z, int *tokenType){ | |
194 | int i; | |
195 | switch( *z ){ | |
2f1eacfc JH |
196 | case ' ': case '\t': case '\n': case '\f': { |
197 | for(i=1; isspace(z[i]) && z[i] != '\r'; i++){} | |
6386edc5 MM |
198 | *tokenType = TK_SPACE; |
199 | return i; | |
200 | } | |
201 | case '-': { | |
202 | if( z[1]==0 ) return -1; | |
6386edc5 MM |
203 | *tokenType = TK_MINUS; |
204 | return 1; | |
205 | } | |
2b2953c6 MM |
206 | case '(': |
207 | *tokenType = TK_LP; | |
6386edc5 | 208 | return 1; |
2b2953c6 MM |
209 | case ')': |
210 | *tokenType = TK_RP; | |
6386edc5 | 211 | return 1; |
2b2953c6 | 212 | case '*': |
6386edc5 MM |
213 | *tokenType = TK_STAR; |
214 | return 1; | |
2b2953c6 | 215 | case '=': |
6386edc5 | 216 | *tokenType = TK_EQ; |
2b2953c6 | 217 | return 1; |
6386edc5 MM |
218 | case '<': { |
219 | if( z[1]=='=' ){ | |
220 | *tokenType = TK_LE; | |
221 | return 2; | |
222 | }else if( z[1]=='>' ){ | |
223 | *tokenType = TK_NE; | |
224 | return 2; | |
6386edc5 MM |
225 | }else{ |
226 | *tokenType = TK_LT; | |
227 | return 1; | |
228 | } | |
229 | } | |
230 | case '>': { | |
231 | if( z[1]=='=' ){ | |
232 | *tokenType = TK_GE; | |
233 | return 2; | |
6386edc5 MM |
234 | }else{ |
235 | *tokenType = TK_GT; | |
236 | return 1; | |
237 | } | |
238 | } | |
239 | case '!': { | |
240 | if( z[1]!='=' ){ | |
241 | *tokenType = TK_ILLEGAL; | |
242 | return 2; | |
243 | }else{ | |
244 | *tokenType = TK_NE; | |
245 | return 2; | |
246 | } | |
247 | } | |
2b2953c6 | 248 | case '?': |
ab519f2a MM |
249 | *tokenType = TK_WILDCARD; |
250 | return 1; | |
2b2953c6 | 251 | case ',': |
6386edc5 MM |
252 | *tokenType = TK_COMMA; |
253 | return 1; | |
08d1398d | 254 | case '`': case '\'': { |
6386edc5 MM |
255 | int delim = z[0]; |
256 | for(i=1; z[i]; i++){ | |
7f3faffc JH |
257 | if( z[i]==delim ) |
258 | break; | |
6386edc5 MM |
259 | } |
260 | if( z[i] ) i++; | |
d31f1296 MM |
261 | if( delim == '`' ) |
262 | *tokenType = TK_ID; | |
263 | else | |
264 | *tokenType = TK_STRING; | |
6386edc5 MM |
265 | return i; |
266 | } | |
267 | case '.': { | |
268 | if( !isdigit(z[1]) ){ | |
269 | *tokenType = TK_DOT; | |
270 | return 1; | |
271 | } | |
272 | /* Fall thru into the next case */ | |
273 | } | |
274 | case '0': case '1': case '2': case '3': case '4': | |
275 | case '5': case '6': case '7': case '8': case '9': { | |
276 | *tokenType = TK_INTEGER; | |
277 | for(i=1; isdigit(z[i]); i++){} | |
6386edc5 MM |
278 | return i; |
279 | } | |
280 | case '[': { | |
281 | for(i=1; z[i] && z[i-1]!=']'; i++){} | |
282 | *tokenType = TK_ID; | |
283 | return i; | |
284 | } | |
285 | default: { | |
286 | if( !isIdChar[*z] ){ | |
287 | break; | |
288 | } | |
289 | for(i=1; isIdChar[z[i]]; i++){} | |
290 | *tokenType = sqliteKeywordCode(z, i); | |
291 | return i; | |
292 | } | |
293 | } | |
294 | *tokenType = TK_ILLEGAL; | |
295 | return 1; | |
296 | } |