usp10: Fix pair adjustment for RTL text.
[wine] / tools / wmc / mcl.c
1 /*
2  * Wine Message Compiler lexical scanner
3  *
4  * Copyright 2000 Bertho A. Stultiens (BS)
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
19  */
20
21 #include "config.h"
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <ctype.h>
26 #include <assert.h>
27 #include <string.h>
28
29 #include "utils.h"
30 #include "wmc.h"
31 #include "lang.h"
32
33 #include "mcy.tab.h"
34
35 /*
36  * Keywords are case insensitive. All normal input is treated as
37  * being in codepage iso-8859-1 for ascii input files (unicode
38  * page 0) and as equivalent unicode if unicode input is selected.
39  * All normal input, which is not part of a message text, is
40  * enforced to be unicode page 0. Otherwise an error will be
41  * generated. The normal file data should only be ASCII because
42  * that is the basic definition of the grammar.
43  *
44  * Byteorder of unicode input is determined automatically by
45  * reading the first 8 bytes and checking them against unicode
46  * page 0 byteorder (hibyte must be 0).
47  * -- FIXME --
48  * Alternatively, the input is checked against a special byte
49  * sequence to identify the file.
50  * -- FIXME --
51  *
52  *
53  * Keywords:
54  *      Codepages
55  *      Facility
56  *      FacilityNames
57  *      LanguageNames
58  *      MessageId
59  *      MessageIdTypedef
60  *      Severity
61  *      SeverityNames
62  *      SymbolicName
63  *
64  * Default added identifiers for classes:
65  * SeverityNames:
66  *      Success         = 0x0
67  *      Informational   = 0x1
68  *      Warning         = 0x2
69  *      Error           = 0x3
70  * FacilityNames:
71  *      System          = 0x0FF
72  *      Application     = 0xFFF
73  *
74  * The 'Codepages' keyword is a wmc extension.
75  */
76
77 static const WCHAR ustr_application[]   = { 'A', 'p', 'p', 'l', 'i', 'c', 'a', 't', 'i', 'o', 'n', 0 };
78 static const WCHAR ustr_codepages[]     = { 'C', 'o', 'd', 'e', 'p', 'a', 'g', 'e', 's', 0 };
79 static const WCHAR ustr_english[]       = { 'E', 'n', 'g', 'l', 'i', 's', 'h', 0 };
80 static const WCHAR ustr_error[]         = { 'E', 'r', 'r', 'o', 'r', 0 };
81 static const WCHAR ustr_facility[]      = { 'F', 'a', 'c', 'i', 'l', 'i', 't', 'y', 0 };
82 static const WCHAR ustr_facilitynames[] = { 'F', 'a', 'c', 'i', 'l', 'i', 't', 'y', 'N', 'a', 'm', 'e', 's', 0 };
83 static const WCHAR ustr_informational[] = { 'I', 'n', 'f', 'o', 'r', 'm', 'a', 't', 'i', 'o', 'n', 'a', 'l', 0 };
84 static const WCHAR ustr_language[]      = { 'L', 'a', 'n', 'g', 'u', 'a', 'g', 'e', 0};
85 static const WCHAR ustr_languagenames[] = { 'L', 'a', 'n', 'g', 'u', 'a', 'g', 'e', 'N', 'a', 'm', 'e', 's', 0};
86 static const WCHAR ustr_messageid[]     = { 'M', 'e', 's', 's', 'a', 'g', 'e', 'I', 'd', 0 };
87 static const WCHAR ustr_messageidtypedef[] = { 'M', 'e', 's', 's', 'a', 'g', 'e', 'I', 'd', 'T', 'y', 'p', 'e', 'd', 'e', 'f', 0 };
88 static const WCHAR ustr_outputbase[]    = { 'O', 'u', 't', 'p', 'u', 't', 'B', 'a', 's', 'e', 0 };
89 static const WCHAR ustr_severity[]      = { 'S', 'e', 'v', 'e', 'r', 'i', 't', 'y', 0 };
90 static const WCHAR ustr_severitynames[] = { 'S', 'e', 'v', 'e', 'r', 'i', 't', 'y', 'N', 'a', 'm', 'e', 's', 0 };
91 static const WCHAR ustr_success[]       = { 'S', 'u', 'c', 'c', 'e', 's', 's', 0 };
92 static const WCHAR ustr_symbolicname[]  = { 'S', 'y', 'm', 'b', 'o', 'l', 'i', 'c', 'N', 'a', 'm', 'e', 0 };
93 static const WCHAR ustr_system[]        = { 'S', 'y', 's', 't', 'e', 'm', 0 };
94 static const WCHAR ustr_warning[]       = { 'W', 'a', 'r', 'n', 'i', 'n', 'g', 0 };
95 static const WCHAR ustr_msg00001[]      = { 'm', 's', 'g', '0', '0', '0', '0', '1', 0 };
96 /*
97  * This table is to beat any form of "expression building" to check for
98  * correct filename characters. It is also used for ident checks.
99  * FIXME: use it more consistently.
100  */
101
/* Character class bits stored in char_table[] */
#define CH_SHORTNAME    0x01
#define CH_LONGNAME     0x02
#define CH_IDENT        0x04
#define CH_NUMBER       0x08
/*#define CH_WILDCARD   0x10*/
/*#define CH_DOT        0x20*/
#define CH_PUNCT        0x40
#define CH_INVALID      0x80

/*
 * Class bits for every iso-8859-1 character, indexed by character code.
 * Each row below covers sixteen consecutive codes.
 */
static const char char_table[256] = {
        /* 0x00 - 0x0F */
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
        /* 0x10 - 0x1F */
        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
        /* 0x20 - 0x2F: space and punctuation */
        0x80, 0x03, 0x80, 0x03, 0x03, 0x03, 0x03, 0x03, 0x43, 0x43, 0x10, 0x80, 0x03, 0x03, 0x22, 0x80,
        /* 0x30 - 0x3F: digits, then punctuation */
        0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0xc0, 0x80, 0x80, 0x80, 0x80, 0x10,
        /* 0x40 - 0x4F: at-sign, A-O */
        0x03, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
        /* 0x50 - 0x5F: P-Z, brackets, underscore */
        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x80, 0x80, 0x80, 0x80, 0x07,
        /* 0x60 - 0x6F: backtick, a-o */
        0x03, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
        /* 0x70 - 0x7F: p-z, braces, bar, tilde, DEL */
        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x03, 0x80, 0x03, 0x03, 0x80,
        /* 0x80 - 0x8F */
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
        /* 0x90 - 0x9F */
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
        /* 0xA0 - 0xAF */
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
        /* 0xB0 - 0xBF */
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
        /* 0xC0 - 0xCF */
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
        /* 0xD0 - 0xDF */
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
        /* 0xE0 - 0xEF */
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
        /* 0xF0 - 0xFF */
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x80,
};
145
/*
 * Tell whether ch lies in unicode page 0 (iso-8859-1), i.e. has no
 * bits set above the low eight. Returns 1 if so, 0 otherwise.
 */
static int isisochar(int ch)
{
        const int high_bits = ch & ~0xff;

        return high_bits == 0;
}
150
151 static int codepage;
152 static const union cptable *codepage_def;
153
154 void set_codepage(int cp)
155 {
156         codepage = cp;
157         codepage_def = find_codepage(codepage);
158         if(!codepage_def && codepage != CP_UTF8)
159                 xyyerror("Codepage %d not found; cannot process\n", codepage);
160 }
161
162 /*
163  * Input functions
164  */
165 static int nungetstack = 0;
166 static int allocungetstack = 0;
167 static char *ungetstack = NULL;
168 static int ninputbuffer = 0;
169 static WCHAR *inputbuffer = NULL;
170 static char *xlatebuffer = NULL;
171
172 #define INPUTBUFFER_SIZE        2048    /* Must be larger than 4 and approx. large enough to hold a line */
173
174 /*
175  * Fill the input buffer with *one* line of input.
176  * The line is '\n' terminated so that scanning
177  * messages with translation works as expected
178  * (otherwise we cannot pre-translate because the
179  * language is first known one line before the
180  * actual message).
181  */
182 static int fill_inputbuffer(void)
183 {
184         int n;
185         static const char err_fatalread[] = "Fatal: reading input failed";
186         static int endian = -1;
187
188         if(!inputbuffer)
189         {
190                 inputbuffer = xmalloc(INPUTBUFFER_SIZE*sizeof(WCHAR));
191                 xlatebuffer = xmalloc(INPUTBUFFER_SIZE);
192         }
193
194 try_again:
195         if(!unicodein)
196         {
197                 char *cptr;
198                 cptr = fgets(xlatebuffer, INPUTBUFFER_SIZE, yyin);
199                 if(!cptr && ferror(yyin))
200                         xyyerror(err_fatalread);
201                 else if(!cptr)
202                         return 0;
203                 if (codepage_def)
204                     n = wine_cp_mbstowcs(codepage_def, 0, xlatebuffer, strlen(xlatebuffer)+1, inputbuffer, INPUTBUFFER_SIZE);
205                 else
206                     n = wine_utf8_mbstowcs(0, xlatebuffer, strlen(xlatebuffer)+1, inputbuffer, INPUTBUFFER_SIZE);
207                 if(n < 0)
208                         internal_error(__FILE__, __LINE__, "Could not translate to unicode (%d)\n", n);
209                 if(n <= 1)
210                         goto try_again; /* Should not happen */
211                 n--;    /* Strip added conversion '\0' from input length */
212                 /*
213                  * FIXME:
214                  * Detect UTF-8 in the first time we read some bytes by
215                  * checking the special sequence "FE..." or something like
216                  * that. I need to check www.unicode.org for details.
217                  */
218         }
219         else
220         {
221                 if(endian == -1)
222                 {
223                         n = fread(inputbuffer, 1, 8, yyin);
224                         if(n != 8)
225                         {
226                                 if(!n && ferror(yyin))
227                                         xyyerror(err_fatalread);
228                                 else
229                                         xyyerror("Fatal: file to short to determine byteorder (should never happen)\n");
230                         }
231                         if(isisochar(inputbuffer[0]) &&
232                                 isisochar(inputbuffer[1]) &&
233                                 isisochar(inputbuffer[2]) &&
234                                 isisochar(inputbuffer[3]))
235                         {
236 #ifdef WORDS_BIGENDIAN
237                                 endian = WMC_BO_BIG;
238 #else
239                                 endian = WMC_BO_LITTLE;
240 #endif
241                         }
242                         else if(isisochar(BYTESWAP_WORD(inputbuffer[0])) &&
243                                 isisochar(BYTESWAP_WORD(inputbuffer[1])) &&
244                                 isisochar(BYTESWAP_WORD(inputbuffer[2])) &&
245                                 isisochar(BYTESWAP_WORD(inputbuffer[3])))
246                         {
247 #ifdef WORDS_BIGENDIAN
248                                 endian = WMC_BO_LITTLE;
249 #else
250                                 endian = WMC_BO_BIG;
251 #endif
252                         }
253                         else
254                                 xyyerror("Fatal: cannot determine file's byteorder\n");
255                         /* FIXME:
256                          * Determine the file-endian with the leader-bytes
257                          * "FF FE..."; can't remember the exact sequence.
258                          */
259                         n /= 2;
260 #ifdef WORDS_BIGENDIAN
261                         if(endian == WMC_BO_LITTLE)
262 #else
263                         if(endian == WMC_BO_BIG)
264 #endif
265                         {
266                                 inputbuffer[0] = BYTESWAP_WORD(inputbuffer[0]);
267                                 inputbuffer[1] = BYTESWAP_WORD(inputbuffer[1]);
268                                 inputbuffer[2] = BYTESWAP_WORD(inputbuffer[2]);
269                                 inputbuffer[3] = BYTESWAP_WORD(inputbuffer[3]);
270                         }
271
272                 }
273                 else
274                 {
275                         int i;
276                         n = 0;
277                         for(i = 0; i < INPUTBUFFER_SIZE; i++)
278                         {
279                                 int t;
280                                 t = fread(&inputbuffer[i], 2, 1, yyin);
281                                 if(!t && ferror(yyin))
282                                         xyyerror(err_fatalread);
283                                 else if(!t && n)
284                                         break;
285                                 n++;
286 #ifdef WORDS_BIGENDIAN
287                                 if(endian == WMC_BO_LITTLE)
288 #else
289                                 if(endian == WMC_BO_BIG)
290 #endif
291                                 {
292                                         if((inputbuffer[i] = BYTESWAP_WORD(inputbuffer[i])) == '\n')
293                                                 break;
294                                 }
295                                 else
296                                 {
297                                         if(inputbuffer[i] == '\n')
298                                                 break;
299                                 }
300                         }
301                 }
302
303         }
304
305         if(!n)
306         {
307                 mcy_warning("Re-read line (input was or converted to zilch)\n");
308                 goto try_again; /* Should not happen, but could be due to stdin reading and a signal */
309         }
310
311         ninputbuffer += n;
312         return 1;
313 }
314
315 static int get_unichar(void)
316 {
317         static WCHAR *b = NULL;
318         char_number++;
319
320         if(nungetstack)
321                 return ungetstack[--nungetstack];
322
323         if(!ninputbuffer)
324         {
325                 if(!fill_inputbuffer())
326                         return EOF;
327                 b = inputbuffer;
328         }
329
330         ninputbuffer--;
331         return (int)(*b++ & 0xffff);
332 }
333
334 static void unget_unichar(int ch)
335 {
336         if(ch == EOF)
337                 return;
338
339         char_number--;
340
341         if(nungetstack == allocungetstack)
342         {
343                 allocungetstack += 32;
344                 ungetstack = xrealloc(ungetstack, allocungetstack * sizeof(*ungetstack));
345         }
346
347         ungetstack[nungetstack++] = (WCHAR)ch;
348 }
349
350
351 /*
352  * Normal character stack.
353  * Used for number scanning.
354  */
static int ncharstack = 0;              /* Characters currently on the stack */
static int alloccharstack = 0;          /* Elements allocated for charstack */
static char *charstack = NULL;

/* Discard the stack's contents; the allocated storage is kept for reuse. */
static void empty_char_stack(void)
{
        ncharstack -= ncharstack;
}
363
364 static void push_char(int ch)
365 {
366         if(ncharstack == alloccharstack)
367         {
368                 alloccharstack += 32;
369                 charstack = xrealloc(charstack, alloccharstack * sizeof(*charstack));
370         }
371         charstack[ncharstack++] = (char)ch;
372 }
373
374 static int tos_char_stack(void)
375 {
376         if(!ncharstack)
377                 return 0;
378         else
379                 return (int)(charstack[ncharstack-1] & 0xff);
380 }
381
382 static char *get_char_stack(void)
383 {
384         return charstack;
385 }
386
387 /*
388  * Unicode character stack.
389  * Used for general scanner.
390  */
391 static int nunicharstack = 0;
392 static int allocunicharstack = 0;
393 static WCHAR *unicharstack = NULL;
394
395 static void empty_unichar_stack(void)
396 {
397         nunicharstack = 0;
398 }
399
400 static void push_unichar(int ch)
401 {
402         if(nunicharstack == allocunicharstack)
403         {
404                 allocunicharstack += 128;
405                 unicharstack = xrealloc(unicharstack, allocunicharstack * sizeof(*unicharstack));
406         }
407         unicharstack[nunicharstack++] = (WCHAR)ch;
408 }
409
#if 0
/*
 * Return the most recently pushed unicode character as a value in
 * 0..0xffff, or 0 when the stack is empty. Currently compiled out.
 */
static int tos_unichar_stack(void)
{
        return nunicharstack ? (int)(unicharstack[nunicharstack-1] & 0xffff) : 0;
}
#endif
419
420 static WCHAR *get_unichar_stack(void)
421 {
422         return unicharstack;
423 }
424
425 /*
426  * Number scanner
427  *
428  * state |      ch         | next state
429  * ------+-----------------+--------------------------
430  *   0   | [0]             | 1
431  *   0   | [1-9]           | 4
432  *   0   | .               | error (should never occur)
433  *   1   | [xX]            | 2
434  *   1   | [0-7]           | 3
435  *   1   | [89a-wyzA-WYZ_] | error invalid digit
436  *   1   | .               | return 0
437  *   2   | [0-9a-fA-F]     | 2
438  *   2   | [g-zG-Z_]       | error invalid hex digit
439  *   2   | .               | return (hex-number) if TOS != [xX] else error
440  *   3   | [0-7]           | 3
441  *   3   | [89a-zA-Z_]     | error invalid octal digit
442  *   3   | .               | return (octal-number)
443  *   4   | [0-9]           | 4
444  *   4   | [a-zA-Z_]       | error invalid decimal digit
445  *   4   | .               | return (decimal-number)
446  *
447  * All non-identifier characters [^a-zA-Z_0-9] terminate the scan
448  * and return the value. This is not entirely correct, but close
449  * enough (should check punctuators as trailing context, but the
450  * char_table is not adapted to that and it is questionable whether
451  * it is worth the trouble).
452  * All non-iso-8859-1 characters are an error.
453  */
454 static int scan_number(int ch)
455 {
456         int state = 0;
457         int base = 10;
458         empty_char_stack();
459
460         while(1)
461         {
462                 if(!isisochar(ch))
463                         xyyerror("Invalid digit\n");
464
465                 switch(state)
466                 {
467                 case 0:
468                         if(isdigit(ch))
469                         {
470                                 push_char(ch);
471                                 if(ch == '0')
472                                         state = 1;
473                                 else
474                                         state = 4;
475                         }
476                         else
477                                 internal_error(__FILE__, __LINE__, "Non-digit in first number-scanner state\n");
478                         break;
479                 case 1:
480                         if(ch == 'x' || ch == 'X')
481                         {
482                                 push_char(ch);
483                                 state = 2;
484                         }
485                         else if(ch >= '0' && ch <= '7')
486                         {
487                                 push_char(ch);
488                                 state = 3;
489                         }
490                         else if(isalpha(ch) || ch == '_')
491                                 xyyerror("Invalid number digit\n");
492                         else
493                         {
494                                 unget_unichar(ch);
495                                 mcy_lval.num = 0;
496                                 return tNUMBER;
497                         }
498                         break;
499                 case 2:
500                         if(isxdigit(ch))
501                                 push_char(ch);
502                         else if(isalpha(ch) || ch == '_' || !isxdigit(tos_char_stack()))
503                                 xyyerror("Invalid hex digit\n");
504                         else
505                         {
506                                 base = 16;
507                                 goto finish;
508                         }
509                         break;
510                 case 3:
511                         if(ch >= '0' && ch <= '7')
512                                 push_char(ch);
513                         else if(isalnum(ch) || ch == '_')
514                                 xyyerror("Invalid octal digit\n");
515                         else
516                         {
517                                 base = 8;
518                                 goto finish;
519                         }
520                         break;
521                 case 4:
522                         if(isdigit(ch))
523                                 push_char(ch);
524                         else if(isalnum(ch) || ch == '_')
525                                 xyyerror("Invalid decimal digit\n");
526                         else
527                         {
528                                 base = 10;
529                                 goto finish;
530                         }
531                         break;
532                 default:
533                         internal_error(__FILE__, __LINE__, "Invalid state in number-scanner\n");
534                 }
535                 ch = get_unichar();
536         }
537 finish:
538         unget_unichar(ch);
539         push_char(0);
540         mcy_lval.num = strtoul(get_char_stack(), NULL, base);
541         return tNUMBER;
542 }
543
544 static void newline(void)
545 {
546         line_number++;
547         char_number = 1;
548 }
549
550 static int unisort(const void *p1, const void *p2)
551 {
552         return unistricmp(((const token_t *)p1)->name, ((const token_t *)p2)->name);
553 }
554
555 static token_t *tokentable = NULL;
556 static int ntokentable = 0;
557
558 token_t *lookup_token(const WCHAR *s)
559 {
560         token_t tok;
561
562         tok.name = s;
563         return (token_t *)bsearch(&tok, tokentable, ntokentable, sizeof(*tokentable), unisort);
564 }
565
566 void add_token(tok_e type, const WCHAR *name, int tok, int cp, const WCHAR *alias, int fix)
567 {
568         ntokentable++;
569         tokentable = xrealloc(tokentable, ntokentable * sizeof(*tokentable));
570         tokentable[ntokentable-1].type = type;
571         tokentable[ntokentable-1].name = name;
572         tokentable[ntokentable-1].token = tok;
573         tokentable[ntokentable-1].codepage = cp;
574         tokentable[ntokentable-1].alias = alias;
575         tokentable[ntokentable-1].fixed = fix;
576         qsort(tokentable, ntokentable, sizeof(*tokentable), unisort);
577 }
578
579 void get_tokentable(token_t **tab, int *len)
580 {
581         assert(tab != NULL);
582         assert(len != NULL);
583         *tab = tokentable;
584         *len = ntokentable;
585 }
586
587 /*
588  * The scanner
589  *
590  */
591 int mcy_lex(void)
592 {
593         static const WCHAR ustr_dot1[] = { '.', '\n', 0 };
594         static const WCHAR ustr_dot2[] = { '.', '\r', '\n', 0 };
595         static int isinit = 0;
596         int ch;
597
598         if(!isinit)
599         {
600                 isinit++;
601                 set_codepage(WMC_DEFAULT_CODEPAGE);
602                 add_token(tok_keyword,  ustr_codepages,         tCODEPAGE,      0, NULL, 0);
603                 add_token(tok_keyword,  ustr_facility,          tFACILITY,      0, NULL, 1);
604                 add_token(tok_keyword,  ustr_facilitynames,     tFACNAMES,      0, NULL, 1);
605                 add_token(tok_keyword,  ustr_language,          tLANGUAGE,      0, NULL, 1);
606                 add_token(tok_keyword,  ustr_languagenames,     tLANNAMES,      0, NULL, 1);
607                 add_token(tok_keyword,  ustr_messageid,         tMSGID,         0, NULL, 1);
608                 add_token(tok_keyword,  ustr_messageidtypedef,  tTYPEDEF,       0, NULL, 1);
609                 add_token(tok_keyword,  ustr_outputbase,        tBASE,          0, NULL, 1);
610                 add_token(tok_keyword,  ustr_severity,          tSEVERITY,      0, NULL, 1);
611                 add_token(tok_keyword,  ustr_severitynames,     tSEVNAMES,      0, NULL, 1);
612                 add_token(tok_keyword,  ustr_symbolicname,      tSYMNAME,       0, NULL, 1);
613                 add_token(tok_severity, ustr_error,             0x03,           0, NULL, 0);
614                 add_token(tok_severity, ustr_warning,           0x02,           0, NULL, 0);
615                 add_token(tok_severity, ustr_informational,     0x01,           0, NULL, 0);
616                 add_token(tok_severity, ustr_success,           0x00,           0, NULL, 0);
617                 add_token(tok_facility, ustr_application,       0xFFF,          0, NULL, 0);
618                 add_token(tok_facility, ustr_system,            0x0FF,          0, NULL, 0);
619                 add_token(tok_language, ustr_english,           0x409,          437, ustr_msg00001, 0);
620         }
621
622         empty_unichar_stack();
623
624         while(1)
625         {
626                 if(want_line)
627                 {
628                         while((ch = get_unichar()) != '\n')
629                         {
630                                 if(ch == EOF)
631                                         xyyerror("Unexpected EOF\n");
632                                 push_unichar(ch);
633                         }
634                         newline();
635                         push_unichar(ch);
636                         push_unichar(0);
637                         if(!unistrcmp(ustr_dot1, get_unichar_stack()) || !unistrcmp(ustr_dot2, get_unichar_stack()))
638                         {
639                                 want_line = 0;
640                                 /* Reset the codepage to our default after each message */
641                                 set_codepage(WMC_DEFAULT_CODEPAGE);
642                                 return tMSGEND;
643                         }
644                         mcy_lval.str = xunistrdup(get_unichar_stack());
645                         return tLINE;
646                 }
647
648                 ch = get_unichar();
649
650                 if(ch == EOF)
651                         return EOF;
652
653                 if(ch == '\n')
654                 {
655                         newline();
656                         if(want_nl)
657                         {
658                                 want_nl = 0;
659                                 return tNL;
660                         }
661                         continue;
662                 }
663
664                 if(isisochar(ch))
665                 {
666                         if(want_file)
667                         {
668                                 int n = 0;
669                                 while(n < 8 && isisochar(ch))
670                                 {
671                                         int t = char_table[ch];
672                                         if((t & CH_PUNCT) || !(t & CH_SHORTNAME))
673                                                 break;
674
675                                         push_unichar(ch);
676                                         n++;
677                                         ch = get_unichar();
678                                 }
679                                 unget_unichar(ch);
680                                 push_unichar(0);
681                                 want_file = 0;
682                                 mcy_lval.str = xunistrdup(get_unichar_stack());
683                                 return tFILE;
684                         }
685
686                         if(char_table[ch] & CH_IDENT)
687                         {
688                                 token_t *tok;
689                                 while(isisochar(ch) && (char_table[ch] & (CH_IDENT|CH_NUMBER)))
690                                 {
691                                         push_unichar(ch);
692                                         ch = get_unichar();
693                                 }
694                                 unget_unichar(ch);
695                                 push_unichar(0);
696                                 if(!(tok = lookup_token(get_unichar_stack())))
697                                 {
698                                         mcy_lval.str = xunistrdup(get_unichar_stack());
699                                         return tIDENT;
700                                 }
701                                 switch(tok->type)
702                                 {
703                                 case tok_keyword:
704                                         return tok->token;
705
706                                 case tok_language:
707                                         codepage = tok->codepage;
708                                         /* Fall through */
709                                 case tok_severity:
710                                 case tok_facility:
711                                         mcy_lval.tok = tok;
712                                         return tTOKEN;
713
714                                 default:
715                                         internal_error(__FILE__, __LINE__, "Invalid token type encountered\n");
716                                 }
717                         }
718
719                         if(isspace(ch)) /* Ignore space */
720                                 continue;
721
722                         if(isdigit(ch))
723                                 return scan_number(ch);
724                 }
725
726                 switch(ch)
727                 {
728                 case ':':
729                 case '=':
730                 case '+':
731                 case '(':
732                 case ')':
733                         return ch;
734                 case ';':
735                         while(ch != '\n' && ch != EOF)
736                         {
737                                 push_unichar(ch);
738                                 ch = get_unichar();
739                         }
740                         newline();
741                         push_unichar(ch);       /* Include the newline */
742                         push_unichar(0);
743                         mcy_lval.str = xunistrdup(get_unichar_stack());
744                         return tCOMMENT;
745                 default:
746                         xyyerror("Invalid character '%c' (0x%04x)\n", isisochar(ch) && isprint(ch) ? ch : '.', ch);
747                 }
748         }
749 }