/*
 * File: utf2any.l
 *
 * (c) Peter Kleiweg 2000
 *
 * This is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2,
 * or (at your option) any later version.
 *
 * Compile:
 *
 *    flex -B -8 utf2any.l
 *    gcc -s -Wall -o utf2any lex.yy.c -lfl
 *    rm lex.yy.c
 *
 */

%{

#define UTFanyVERSION "1.0"

/*
 * MAPDIR is the directory where symbol maps are searched.
 * This should be a path, ending with a slash, surrounded by double quotes,
 * or it should be NULL.
 */
#ifndef MAPDIR
#  ifdef __MSDOS__
#    define MAPDIR "c:\\utf\\"
#  else
#    define MAPDIR "/usr/local/lib/utf/"
#  endif
#endif

/*
 * NOTE(review): the header names below were reconstructed from the
 * identifiers this file uses (fnsplit/MAXFILE, O_BINARY, setmode/isatty
 * on MSDOS; isatty and strcasecmp elsewhere).  Confirm the MSDOS set
 * against the compiler actually used.
 */
#ifdef __MSDOS__
#  ifndef __COMPACT__
#    error Memory model COMPACT required
#  endif
#  include <dir.h>
#  include <fcntl.h>
#  include <io.h>
#else
#  include <strings.h>
#  include <unistd.h>
#endif
#include <ctype.h>
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef __MSDOS__
#define strcasecmp(A, B) (stricmp((A), (B)))
#endif

/* Size of the line buffers used while reading symbol map files */
#define BUFSIZE 2048

typedef enum { FALSE = 0, TRUE } BOOL_;

/* Input encoding selected with -7 or -8; uUNDEF means "not chosen yet" */
typedef enum { uUNDEF = 0, uUTF7, uUTF8 } UTF_;

/* What to emit for a code that falls in a range definition */
typedef enum { aECHO, aSPACE, aSKIP, aFORMAT } ACTION_;

/* Type the code is cast to before it is handed to a printf-style format */
typedef enum { cUCHAR, cUNSIGNED, cULONG, cNONE } CAST_;

/* Translation of one single code above 255 */
typedef struct {
    unsigned long
        ul,         /* the code */
        order;      /* sequence number of the definition, later ones win */
    char
        *s;         /* replacement text */
} TRANS_;

/* Action for a range of codes above 255 */
typedef struct {
    long unsigned
        from,
        to;
    ACTION_
        action;
    CAST_
        cast;       /* only meaningful for aFORMAT */
    char
        *format;    /* only meaningful for aFORMAT */
} RANGE_;

BOOL_
    verbose = FALSE,
    warnings = FALSE;

UTF_
    utf_type = uUNDEF;

TRANS_
    *trans = NULL;          /* single-code translations, sorted before scanning */

RANGE_
    *range = NULL;          /* range actions, searched from last to first */

char
    buf2 [BUFSIZE + 1],     /* scratch buffer for formatted output in addaction() */
    buffer [BUFSIZE + 1],   /* current line of the symbol map being read */
    bufword [BUFSIZE + 1],  /* last word returned by getword() */
    f_unsigned [] = "[U+%04X]",
    f_ulong [] = "[U+%08lX]",
    *infile,                /* name of the input file, used in warnings */
    *lower [256],           /* replacement for codes 0..255, NULL means echo */
    *no_mem_buffer,         /* reserve that is released when memory runs out */
    out_of_memory [] = "Out of memory",
    *programname,
    s_echo [] = "#ECHO#",
    s_skip [] = "#SKIP#",
    s_space [] = "#SPACE#";

int
    bufp,                   /* read position in buffer */
    max_range = 0,
    max_trans = 0,
    n_range = 0,
    n_trans = 0,
    wtable [256];           /* non-zero: warn (with -w) when this code is output */

unsigned int
    instep,                 /* position (0..3) in the current base64 group */
    outcode [2],            /* the two bytes of the 16-bit value being built */
    outstep;                /* which of the two bytes is being filled */

unsigned long
    order = 0,
    incount = 1;            /* current input line number */

void addchar (char *filename, int lineno, unsigned long ul, char *s);
void addaction (char *filename, int lineno, long unsigned from,
                long unsigned to, ACTION_ action, CAST_ cast, char *format);
void bytes2 (void);
void bytes3 (void);
void bytes4 (void);
void bytes5 (void);
void bytes6 (void);
void codewarn (unsigned long ul);
void errit (char const *format, ...);
void ferrit (char *filename, int lineno, char const *format, ...);
void get_programname (char const *argv0);
void nextout (void);
void outchar (unsigned char i);
void outsymbol (unsigned long ul);
void readtrans (char *file, char *dir, int level);
void *s_malloc (size_t size);
void *s_realloc (void *block, size_t size);
void syntax (void);
void utf7 (void);

char *getbasename (char *s);
char *getdirname (char *s);
char *getword (char *filename, int lineno);
char *s_strdup (char const *s);

/* read_line was called getline, which clashes with POSIX getline() in <stdio.h> */
int read_line (FILE *fp, int *lineno);
int nlcount (void);
int searchcmp (const void *p1, const void *p2);
int srtcmp (const void *p1, const void *p2);

long unsigned getvalue (char *filename, int lineno);

#define YY_NO_UNPUT
#define YY_SKIP_YYWRAP

#ifdef yywrap
#  undef yywrap
#endif
/* There is never a second input file: tell the scanner to stop at EOF */
int yywrap (void)
{
    return 1;
}

/*
 * Scanner overview:
 *
 *   INITIAL  the very first character is pushed back and the scanner
 *            switches to _utf7 or _utf8, depending on utf_type
 *   _utf7    plain characters of UTF-7 input; "+" opens a base64 run
 *   _utf7b   inside a base64 run of UTF-7 input
 *   _utf8    UTF-8 input; multi-byte sequences are matched by lead byte
 *
 * The start conditions are inclusive (%Start), so the first rule must be
 * restricted to INITIAL explicitly.  Without that restriction it would
 * also be active in _utf7 and _utf8 and, being listed first, would keep
 * pushing the same character back forever.
 */

%}

%Start _utf7 _utf7b _utf8

%%

<INITIAL>.|\n           {
                          /* nothing is consumed here: only select the decoder */
                          yyless (0);
                          BEGIN ((utf_type == uUTF7) ? _utf7 : _utf8);
                        }

<_utf7>{
"+-"                    { outchar ('+'); }
"+"                     { instep = outstep = 0; BEGIN _utf7b; }
}

<_utf7b>{
[A-Za-z0-9+/]           { utf7 (); }
"-"                     { BEGIN _utf7; }
.|\n                    {
                          /* any other character ends the base64 run and is output itself */
                          if (yytext [0] == '\n')
                              incount++;
                          outchar (yytext [0]);
                          BEGIN _utf7;
                        }
}

<_utf8>{
[\300-\337].            { incount += nlcount (); bytes2 (); }
[\340-\357]..           { incount += nlcount (); bytes3 (); }
[\360-\367]...          { incount += nlcount (); bytes4 (); }
[\370-\373]....         { incount += nlcount (); bytes5 (); }
[\374-\375].....        { incount += nlcount (); bytes6 (); }
}

<_utf7,_utf8>.|\n       {
                          if (yytext [0] == '\n')
                              incount++;
                          outchar (yytext [0]);
                        }

%%

/*
 * Helper functions for UTF-7 parser
 */

/*
 * Process one base64 character (yytext [0]) of a UTF-7 run.
 * Every character carries 6 bits; they are collected into bytes in
 * outcode [outstep], and nextout () is called each time a byte is full.
 */
void utf7 (void)
{
    unsigned
        i,
        c;

    i = (unsigned char) yytext [0];

    /* the scanner rule only lets base64 characters through; 0 is a safe default */
    c = 0;
    if (i >= 'A' && i <= 'Z')
        c = i - 'A';
    else if (i >= 'a' && i <= 'z')
        c = i + 26 - 'a';
    else if (i >= '0' && i <= '9')
        c = i + 52 - '0';
    else if (i == '+')
        c = 62;
    else if (i == '/')
        c = 63;

    switch (instep) {
        case 0:
            outcode [outstep] = (c << 2);
            break;
        case 1:
            outcode [outstep] |= (c >> 4);
            nextout ();
            outcode [outstep] = (c << 4);
            break;
        case 2:
            outcode [outstep] |= (c >> 2);
            nextout ();
            outcode [outstep] = (c << 6);
            break;
        case 3:
            outcode [outstep] |= c;
            nextout ();
            break;
    }
    if (++instep == 4)
        instep = 0;
}

/*
 * One byte of the UTF-7 run is complete.
 * After the first byte just move on to the second; after the second,
 * combine the low 8 bits of both into one 16-bit code and output it.
 */
void nextout (void)
{
    unsigned
        c;

    if (outstep == 0) {
        outstep = 1;
    } else {
        outstep = 0;
        c = ((outcode [0] & 0xFF) << 8) | (outcode [1] & 0xFF);
        outsymbol (c);
    }
}

/*
 * Helper functions for UTF-8 parser
 *
 * bytesN () decodes the N-byte sequence in yytext: the payload bits of the
 * lead byte followed by the low 6 bits of each following byte.
 */

void bytes2 (void)
{
    unsigned
        u [2],
        c;
    int
        i;

    for (i = 0; i < 2; i++)
        u [i] = (unsigned char) yytext [i];

    c =   ( u [1] & 0x3F)
        | ((u [0] & 0x1F) << 6);
    outsymbol (c);
}

void bytes3 (void)
{
    unsigned
        u [3],
        c;
    int
        i;

    for (i = 0; i < 3; i++)
        u [i] = (unsigned char) yytext [i];

    c =   ( u [2] & 0x3F)
        | ((u [1] & 0x3F) << 6)
        | ((u [0] & 0x0F) << 12);
    outsymbol (c);
}

void bytes4 (void)
{
    long unsigned
        u [4],
        c;
    int
        i;

    for (i = 0; i < 4; i++)
        u [i] = (unsigned char) yytext [i];

    c =   ( u [3] & 0x3F)
        | ((u [2] & 0x3F) << 6)
        | ((u [1] & 0x3F) << 12)
        | ((u [0] & 0x07) << 18);
    outsymbol (c);
}

void bytes5 (void)
{
    long unsigned
        u [5],
        c;
    int
        i;

    for (i = 0; i < 5; i++)
        u [i] = (unsigned char) yytext [i];

    c =   ( u [4] & 0x3F)
        | ((u [3] & 0x3F) << 6)
        | ((u [2] & 0x3F) << 12)
        | ((u [1] & 0x3F) << 18)
        | ((u [0] & 0x03) << 24);
    outsymbol (c);
}

void bytes6 (void)
{
    long unsigned
        u [6],
        c;
    int
        i;

    for (i = 0; i < 6; i++)
        u [i] = (unsigned char) yytext [i];

    c =   ( u [5] & 0x3F)
        | ((u [4] & 0x3F) << 6)
        | ((u [3] & 0x3F) << 12)
        | ((u [2] & 0x3F) << 18)
        | ((u [1] & 0x3F) << 24)
        | ((u [0] & 0x01) << 30);
    outsymbol (c);
}

/*
 * General helper functions for parser
 */

/*
 * Output a code in the range 0..255: its replacement string from lower [],
 * or the byte itself when no replacement is defined.
 */
void outchar (unsigned char c)
{
    if (wtable [c])
        codewarn (c);
    if (! lower [c])
        fputc (c, yyout);
    else
        fputs (lower [c], yyout);
}

/*
 * Output any decoded code.
 * Codes below 256 are handled like single bytes.  Other codes are looked
 * up in the table of single translations first; if that fails a warning
 * is given (with -w) and the most recently defined range that contains
 * the code decides what is written.
 */
void outsymbol (unsigned long ul)
{
    int
        i;
    TRANS_
        *p;

    if (ul < 256) {
        outchar ((unsigned char) ul);
        return;
    }

    /* bsearch requires the sorted table that main () prepares before scanning */
    p = (TRANS_ *) bsearch (&ul, trans, n_trans, sizeof (TRANS_), searchcmp);
    if (p) {
        fputs (p->s, yyout);
        return;
    }

    codewarn (ul);

    for (i = n_range - 1; i >= 0; i--)
        if (ul >= range [i].from && ul <= range [i].to)
            break;

    /* no range at all: do not index range [-1], just drop the code */
    if (i < 0)
        return;

    switch (range [i].action) {
        case aSPACE:
            fputc (' ', yyout);
            break;
        case aFORMAT:
            if (range [i].cast == cUCHAR)
                fprintf (yyout, range [i].format, (unsigned char) ul);
            else if (range [i].cast == cUNSIGNED)
                fprintf (yyout, range [i].format, (unsigned) ul);
            else
                fprintf (yyout, range [i].format, (unsigned long) ul);
            break;
        case aECHO:      /* won't happen beyond 255 */
        case aSKIP:
            break;
    }
}

/*
 * With option -w: report a code on stderr, with input file and line number.
 */
void codewarn (unsigned long ul)
{
    if (! warnings)
        return;

    if (ul < 0x10000)
        fprintf (stderr, "%s:%lu: U+%04X %5u\n", infile, incount, (unsigned) ul, (unsigned) ul);
    else
        fprintf (stderr, "%s:%lu: U+%08lX %10lu\n", infile, incount, ul, ul);
}

/*
 * Count the newline characters in yytext (up to its first NUL byte).
 */
int nlcount (void)
{
    int
        i,
        sum;

    sum = 0;
    for (i = 0; yytext [i]; i++)
        if (yytext [i] == '\n')
            sum++;
    return sum;
}

/*
 * Set up the default actions, process the options, read the symbol maps,
 * and run the scanner on the input file or on stdin.
 */
int main (int argc, char *argv [])
{
    int
        i;

    /* reserve that s_malloc/s_realloc release before reporting out-of-memory */
    no_mem_buffer = (char *) malloc (1024);

    get_programname (argv [0]);

    /* pre-defined actions */
    for (i = 0; i < 256; i++) {
        lower [i] = NULL;
        wtable [i] = 0;
    }
    addaction (NULL, 0, 0x10000, 0x7FFFFFFF, aFORMAT, cULONG, f_ulong);
    addaction (NULL, 0, 0x100, 0xFFFF, aFORMAT, cUNSIGNED, f_unsigned);
    addaction (NULL, 0, 0x7F, 0x9F, aFORMAT, cUNSIGNED, f_unsigned);
    addaction (NULL, 0, 0, 0x1F, aFORMAT, cUNSIGNED, f_unsigned);
    addchar (NULL, 0, '\t', s_echo);
    addchar (NULL, 0, '\n', s_echo);
    addchar (NULL, 0, '\r', s_echo);
    addchar (NULL, 0, '\f', s_echo);

    while (argc > 1 && argv [1][0] == '-') {
        if (! strcmp (argv [1], "-7"))
            utf_type = uUTF7;
        else if (! strcmp (argv [1], "-8"))
            utf_type = uUTF8;
        else if (argv [1][1] == 'f') {
            if (argv [1][2])
                /* file name attached to the option: -ffile */
                readtrans (argv [1] + 2, MAPDIR, 0);
            else {
                /* file name is the next argument: -f file */
                if (argc == 2)
                    errit ("Missing argument for option '-f'");
                argv++;
                argc--;
                readtrans (argv [1], MAPDIR, 0);
            }
        } else if (! strcmp (argv [1], "-v"))
            verbose = TRUE;
        else if (! strcmp (argv [1], "-w"))
            warnings = TRUE;
        else
            syntax ();
        argv++;
        argc--;
    }

    if (n_trans) {
        /*
         * Sort on code, then on order of definition.  Of two entries with
         * the same code the first is removed, so the last definition wins.
         */
        qsort (trans, n_trans, sizeof (TRANS_), srtcmp);
        i = 0;
        while (i < n_trans - 1)
            if (trans [i].ul == trans [i + 1].ul) {
                free (trans [i].s);
                memmove (trans + i, trans + i + 1, (n_trans - i - 1) * sizeof (TRANS_));
                n_trans--;
            } else
                i++;
    }

    switch (argc) {
        case 1:
            if (isatty (fileno (stdin)))
                syntax ();
            yyin = stdin;
            infile = "(stdin)";
            break;
        case 2:
            yyin = fopen (argv [1], "r");
            if (! yyin)
                errit ("Opening file \"%s\": %s", argv [1], strerror (errno));
            infile = argv [1];
            break;
        default:
            syntax ();
    }

    if (! utf_type)
        errit ("Missing option '-7' or '-8'");

    yyout = stdout;

#ifdef __MSDOS__
    setmode (fileno (yyin ), O_BINARY);
    setmode (fileno (yyout), O_BINARY);
#endif

    yylex ();

    if (yyin != stdin)
        fclose (yyin);
    if (yyout != stdout)
        fclose (yyout);     /* was fclose (yyin): wrong stream, closed twice */

    return 0;
}

/*
 * qsort comparison for TRANS_: by code, then by order of definition.
 * Never returns 0; the order numbers handed out by addchar () are unique.
 */
int srtcmp (const void *p1, const void *p2)
{
    const TRANS_
        *t1 = (const TRANS_ *) p1,
        *t2 = (const TRANS_ *) p2;

    if (t1->ul < t2->ul)
        return -1;
    if (t1->ul > t2->ul)
        return 1;

    if (t1->order < t2->order)
        return -1;
    return 1;
}

/*
 * bsearch comparison: p1 points to the code searched for, p2 to a TRANS_.
 */
int searchcmp (const void *p1, const void *p2)
{
    unsigned long
        ul1,
        ul2;

    ul1 = *((const unsigned long *) p1);
    ul2 = ((const TRANS_ *) p2)->ul;
    if (ul1 < ul2)
        return -1;
    else if (ul1 > ul2)
        return 1;
    else
        return 0;
}

/*
 * Read a symbol map.
 *
 *   file   name of the map; if it cannot be opened and has no directory
 *          part, it is tried again inside dir
 *   dir    fallback directory (ending in a path separator), or NULL
 *   level  nesting depth of include files; more than 10 is an error
 *
 * Lines starting with 'd' or 'D' define an action for a range of codes,
 * lines starting with 'i' or 'I' include another map, and all other
 * lines define the translation of one single code.
 */
void readtrans (char *file, char *dir, int level)
{
    int
        lineno;
    long unsigned
        from,
        to,
        ul;
    char
        *s,
        *filename,
        *basename,
        *dirname;
    FILE
        *fp;
    CAST_
        cast;

    if (level > 10)
        errit ("File \"%s\": nesting too deep", file);

    /*
     * Try opening file
     * If failure and filename has no directory part, then try in dir
     */
    filename = file;
    fp = fopen (filename, "r");
    if ((! fp) && dir) {
        basename = getbasename (filename);
        if (! strcmp (basename, filename)) {
            filename = (char *) s_malloc (
                (strlen (basename) + strlen (dir) + 1) * sizeof (char)
            );
            strcpy (filename, dir);
            strcat (filename, basename);
            fp = fopen (filename, "r");
        }
    }
    if (! fp)
        errit ("Opening file \"%s\": %s", filename, strerror (errno));

    if (verbose)
        fprintf (stderr, "Begin %s\n", filename);

    /* include files without a directory part are searched next to this file */
    dirname = getdirname (filename);
    if (! dirname)
        dirname = dir;

    lineno = 0;
    while (read_line (fp, &lineno)) {
        switch (buffer [bufp]) {

            /* translation for range of characters */
            case 'd':
            case 'D':
                bufp++;
                from = getvalue (filename, lineno);
                to = getvalue (filename, lineno);
                s = getword (filename, lineno);
                if (! strcasecmp (s, s_skip))
                    addaction (filename, lineno, from, to, aSKIP, cNONE, NULL);
                else if (! strcasecmp (s, s_echo))
                    addaction (filename, lineno, from, to, aECHO, cNONE, NULL);
                else if (! strcasecmp (s, s_space))
                    addaction (filename, lineno, from, to, aSPACE, cNONE, NULL);
                else {
                    cast = cNONE;
                    if (! strcmp (s, "uchar"))
                        cast = cUCHAR;
                    else if (! strcmp (s, "unsigned"))
                        cast = cUNSIGNED;
                    else if (! strcmp (s, "ulong"))
                        cast = cULONG;
                    else
                        ferrit (filename, lineno, "Illegal action \"%s\"", s);
                    /* the remainder of the line is the output format */
                    addaction (filename, lineno, from, to, aFORMAT, cast, buffer + bufp);
                }
                break;

            /* include file */
            case 'i':
            case 'I':
                bufp++;
                /* copy the name: bufword is overwritten while the include is read */
                readtrans (s_strdup (getword (filename, lineno)), dirname, level + 1);
                break;

            /* single character translation */
            default:
                ul = getvalue (filename, lineno);
                addchar (filename, lineno, ul, buffer + bufp);
        }
    }

    fclose (fp);

    if (verbose)
        fprintf (stderr, "End %s\n", filename);
}

/*
 * Define the translation of one single code.
 * s is the replacement text, or one of the keywords #SKIP#, #SPACE#
 * and #ECHO# (case insensitive); #ECHO# is only allowed below 256.
 */
void addchar (char *filename, int lineno, unsigned long ul, char *s)
{
    BOOL_
        echo;

    if (ul > 0x7fffffff)
        ferrit (filename, lineno, "%s out of range: 0x%lX", s, ul);

    echo = FALSE;
    if (! strcasecmp (s, s_skip))
        s = "";
    else if (! strcasecmp (s, s_space))
        s = " ";
    else if (! strcasecmp (s, s_echo))
        echo = TRUE;

    if (ul < 256) {
        lower [ul] = echo ? NULL : s_strdup (s);
        /* an explicit definition: no warning for this code any more */
        wtable [ul] = 0;
    } else {
        if (echo)
            ferrit (filename, lineno, "%s out of range: 0x%lX", s_echo, ul);
        if (n_trans == max_trans) {
            max_trans += 1024;
            trans = (TRANS_ *) s_realloc (trans, max_trans * sizeof (TRANS_));
        }
        trans [n_trans].ul = ul;
        trans [n_trans].order = order++;
        trans [n_trans++].s = s_strdup (s);
    }
}

/*
 * Define the action for the range of codes from..to (inclusive).
 * The part of the range below 256 is expanded into lower [] right away,
 * and those codes are flagged in wtable [].  The part from 256 upwards
 * is appended to the range table; aECHO is not allowed there.
 */
void addaction (char *filename, int lineno, long unsigned from,
                long unsigned to, ACTION_ action, CAST_ cast, char *format)
{
    long unsigned
        u;

    if (from > to)
        ferrit (filename, lineno, "Illegal range");
    if (from > 0x7fffffff)
        ferrit (filename, lineno, "Begin of range to large: 0x%lX", from);
    if (to > 0x7fffffff)
        ferrit (filename, lineno, "End of range to large: 0x%lX", to);

    for (u = from; u <= to && u < 256; u++) {
        wtable [u] = 1;
        if (action == aSKIP)
            lower [u] = "";
        else if (action == aSPACE)
            lower [u] = " ";
        else if (action == aECHO)
            lower [u] = NULL;
        else {
            /*
             * NOTE(review): format comes from the map file and the result
             * is not bounded to the size of buf2.  sprintf is kept because
             * older compilers this file supports may lack snprintf.
             */
            if (cast == cUCHAR)
                sprintf (buf2, format, (unsigned char) u);
            else if (cast == cUNSIGNED)
                sprintf (buf2, format, (unsigned) u);
            else
                sprintf (buf2, format, (long unsigned) u);
            lower [u] = s_strdup (buf2);
        }
    }

    /* what remains is the part of the range above 255, if any */
    if (from < 256)
        from = 256;
    if (from > to)
        return;

    if (action == aECHO)
        ferrit (filename, lineno, "Out of range for %s", s_echo);

    if (n_range == max_range) {
        max_range += 256;
        range = (RANGE_ *) s_realloc (range, max_range * sizeof (RANGE_));
    }
    range [n_range].from = from;
    range [n_range].to = to;
    range [n_range].action = action;
    if (action == aFORMAT) {
        range [n_range].cast = cast;
        range [n_range].format = s_strdup (format);
    } else {
        /* unused for other actions, but keep the entry fully initialised */
        range [n_range].cast = cNONE;
        range [n_range].format = NULL;
    }
    n_range++;
}

/*
 * Read a number from buffer at bufp and move bufp past it and past any
 * white space that follows.  "U+" and "0x" (either case) introduce a
 * hexadecimal number, a leading "0" an octal number; anything else is
 * read as decimal.
 */
long unsigned getvalue (char *filename, int lineno)
{
    long unsigned
        value;
    int
        n;
    char
        *format;

    while (buffer [bufp] && isspace ((unsigned char) buffer [bufp]))
        bufp++;

    if (((buffer [bufp] == 'u' || buffer [bufp] == 'U') && buffer [bufp + 1] == '+') ||
        (buffer [bufp] == '0' && (buffer [bufp + 1] == 'x' || buffer [bufp + 1] == 'X'))
    ) {
        bufp += 2;
        format = "%lx%n";
    } else if (buffer [bufp] == '0')
        format = "%lo%n";
    else
        format = "%lu%n";

    /* %n stores the number of characters consumed and is not counted */
    if (sscanf (buffer + bufp, format, &value, &n) != 1)
        errit ("Missing value in \"%s\", line %i", filename, lineno);

    bufp += n;

    while (buffer [bufp] && isspace ((unsigned char) buffer [bufp]))
        bufp++;

    return value;
}

/*
 * Read a white-space delimited word from buffer at bufp into bufword and
 * move bufp past it and past any white space that follows.
 * The result is overwritten by the next call.
 */
char *getword (char *filename, int lineno)
{
    int
        n;

    while (buffer [bufp] && isspace ((unsigned char) buffer [bufp]))
        bufp++;

    if (sscanf (buffer + bufp, "%s%n", bufword, &n) != 1)
        errit ("Missing word in \"%s\", line %i", filename, lineno);

    bufp += n;

    while (buffer [bufp] && isspace ((unsigned char) buffer [bufp]))
        bufp++;

    return bufword;
}

/*
 * Return a pointer to the part of filename after the last path
 * separator, or filename itself if it contains none.
 */
char *getbasename (char *filename)
{
    char
        *p;

#ifdef __MSDOS__
    p = strrchr (filename, '\\');
#else   /* unix */
    p = strrchr (filename, '/');
#endif
    if (p)
        return p + 1;
    else
        return filename;
}

/*
 * Return a newly allocated copy of the directory part of filename,
 * including the final path separator, or NULL if there is none.
 */
char *getdirname (char *filename)
{
    char
        c,
        *p,
        *dir;

#ifdef __MSDOS__
    p = strrchr (filename, '\\');
#else   /* unix */
    p = strrchr (filename, '/');
#endif
    if (p) {
        /* cut the string just after the separator while it is copied */
        c = p [1];
        p [1] = '\0';
        dir = s_strdup (filename);
        p [1] = c;
        return dir;
    } else
        return NULL;
}

/*
 * Read the next line that is neither empty nor a comment (first
 * non-blank character '#') into buffer.  Trailing white space is
 * removed, bufp is set to the first non-blank character, and *lineno
 * is incremented for every line read.
 * Returns 1 if a line was found, 0 at end of file.
 */
int read_line (FILE *fp, int *lineno)
{
    int
        i;

    for (;;) {
        if (fgets (buffer, BUFSIZE, fp) == NULL)
            return 0;
        (*lineno)++;

        i = (int) strlen (buffer);
        while (i > 0 && isspace ((unsigned char) buffer [i - 1]))
            buffer [--i] = '\0';

        bufp = 0;
        while (buffer [bufp] && isspace ((unsigned char) buffer [bufp]))
            bufp++;
        if (buffer [bufp] == '#')
            continue;
        if (buffer [bufp])
            return 1;
    }
}

/*
 * Print an error message that refers to a line of a file, then exit.
 * Takes a printf-style format and arguments.
 */
void ferrit (char *filename, int lineno, char const *format, ...)
{
    va_list
        list;

    /* the pre-defined actions in main () are added without a file name */
    fprintf (
        stderr,
        "\nError %s: in file \"%s\", line %i: ",
        programname,
        filename ? filename : "(built-in)",
        lineno
    );

    va_start (list, format);
    vfprintf (stderr, format, list);
    va_end (list);

    fprintf (stderr, "\n\n");

    exit (1);
}

/*
 * Print an error message, then exit.
 * Takes a printf-style format and arguments.
 */
void errit (char const *format, ...)
{
    va_list
        list;

    fprintf (stderr, "\nError %s: ", programname);

    va_start (list, format);
    vfprintf (stderr, format, list);
    va_end (list);

    fprintf (stderr, "\n\n");

    exit (1);
}

/*
 * Set programname to the name of the program without its directory part.
 */
void get_programname (char const *argv0)
{
#ifdef __MSDOS__
    char
        name [MAXFILE];
    fnsplit (argv0, NULL, NULL, name, NULL);
    programname = strdup (name);
#else   /* unix */
    char
        *p;
    p = strrchr (argv0, '/');
    if (p)
        programname = strdup (p + 1);
    else
        programname = strdup (argv0);
#endif
}

/*
 * malloc that does not return on failure: the reserve buffer is
 * released and the program exits with an error message.
 */
void *s_malloc (size_t size)
{
    void
        *p;

    p = malloc (size);
    if (! p) {
        free (no_mem_buffer);
        errit (out_of_memory);
    }
    return p;
}

/*
 * realloc that does not return on failure, see s_malloc ().
 */
void *s_realloc (void *block, size_t size)
{
    void
        *p;

    p = realloc (block, size);
    if (! p) {
        free (no_mem_buffer);
        errit (out_of_memory);
    }
    return p;
}

/*
 * Return a newly allocated copy of s; a NULL pointer is copied as an
 * empty string.
 */
char *s_strdup (char const *s)
{
    char
        *s1;

    if (s) {
        s1 = (char *) s_malloc (strlen (s) + 1);
        strcpy (s1, s);
    } else {
        s1 = (char *) s_malloc (1);
        s1 [0] = '\0';
    }
    return s1;
}

/*
 * Print the usage message on stderr and exit.
 */
void syntax (void)
{
    fprintf (
        stderr,
        "\n"
        "This is utf2any, version " UTFanyVERSION "\n"
        "\n"
        "Usage: %s -7|-8 [-f mapfile] [-v] [-w] [infile]\n"
        "\n"
        " -7 : Input is UTF-7\n"
        " -8 : Input is UTF-8\n"
        " -f : File with definitions of the symbol mappings\n"
        " If multiple -f options are given, the files are processed in turn\n"
        " -v : Verbose\n"
        " -w : Warning messages\n"
        "\n",
        programname
    );
    exit (1);
}