/* program Wsltex */

/*
********************************************************************
Copy an 8-bit file to a 7-bit file, turning characters with bit 8 set
into 4-character octal escape sequences \nnn.  This is useful in
analyzing WordStar microcomputer word processor files.

Many sequences which can be recognized as doing something useful are
turned into LaTeX command sequences.  Comments below describe the
sequences which are recognized and translated.

Usage:
        @WSLTEX WordStar_inputfile >LaTeX_outputfile

Remarks:
This program was converted automatically from Pascal to C by James
Peterson's Pascal-to-C translator from Carnegie-Mellon University.
Hand editing was required on all I/O references, which are not
converted satisfactorily (e.g. the Pascal file variable points to the
current input character returned by get(), while in C, one must
explicitly save the result of getc()).  Comment and type declaration
formatting were also rearranged by hand.  Defined constants were
changed to upper-case following the usual convenient C convention of
visible differences between variables and constants, and functions
and macros.

The conversion of a test 188K Ph.D. thesis file produced identical
results with both versions.  Pascal compilation was much faster than
the PCC compiler.  The conversion time with the Pascal version was
15.5 sec.  In the C version using putc(), it was 25.1 sec; changing
putc() to use of write() with large buffers further reduced the time
to 21.2 sec.

[21-Dec-85] conversion from Pascal
[11-Oct-85] original version
********************************************************************
*/

/* One of OS_UNIX or OS_TOPS20 should be defined at compile time. */
/* Default is OS_UNIX */
#ifndef OS_TOPS20
#define OS_TOPS20 0
#define OS_UNIX 1
#else
#define OS_UNIX 0
#endif

/* NOTE(review): the operands of the two original #include lines were
   lost (angle-bracket text stripped).  <stdio.h> is certain from the
   use of FILE/getc/putc; <file.h> is the presumed home of the FATT_*
   flags on TOPS-20 -- confirm against the original distribution.
   The OS selection above was moved ahead of the includes so that the
   TOPS-20 header is only requested on TOPS-20. */
#include <stdio.h>
#if OS_TOPS20
#include <file.h>                       /* for TOPS-20 open magic */
#else
#include <stdlib.h>                     /* malloc(), exit() */
#include <unistd.h>                     /* write() */
#endif

/* Write an end-of-line sequence to stream fp. */
#if OS_UNIX
#define WRITELN(fp) putc('\n',fp)
#else
#define WRITELN(fp) do {putc('\r',fp);putc('\n',fp);} while (0)
#endif

/* Flush the_buffer[0 .. the_next-1] to the output file with a single
   raw write and mark the buffer empty.  The result of the write is
   not checked. */
#if OS_TOPS20
#define EMPTY_BUFFER do {if (the_next > 0) \
        (void)_write(fileno(LaTeX_file),the_buffer,the_next);\
    the_next = 0;} while (0)
#else
#define EMPTY_BUFFER do {if (the_next > 0) \
        (void)write(fileno(LaTeX_file),the_buffer,the_next);\
    the_next = 0;} while (0)
#endif

typedef int BOOLEAN;
#define FALSE 0
#define TRUE 1

#define NUL '\0'
#define BEL '\07'
#define CR '\r'
#define ESC '\033'
#define FF '\f'
#define HT '\t'
#define LF '\n'

#define MAXRING 256             /* limit for character ring */
#define MAXSPECIAL 100          /* limit for warning when special
                                   strings (underline, bold, etc)
                                   exceed this length */
#define MAXBUF 16384            /* large buffer for I/O efficiency */
#define UNSET 0x8000            /* use most-negative 16-bit integer */

/* characters which LaTeX treats specially and which are therefore
   written with a preceding backslash */
#define LATEX_SPECIALS "#$%&~_^\\{}"

FILE *WordStar_file;            /* 8-bit input file */
FILE *LaTeX_file;               /* 7-bit or 8-bit output file */

char *the_buffer;               /* the output buffer (malloc()'ed) */
int the_next;                   /* index of next empty slot in the_buffer[]*/

int ring_index;                 /* index into ring[] */
char ring[MAXRING+1];           /* ring buffer for current output context */

BOOLEAN hyphen_break;           /* hyphen break flag */
BOOLEAN eol;                    /* end-of-line flag */

BOOLEAN is_underline;           /* underline et al flags */
BOOLEAN is_subscript;
BOOLEAN is_superscript;
BOOLEAN is_bold;
BOOLEAN is_alternate;

int bi_eol;                     /* input byte offset at end-of-line */
int bi_underline;
int bi_subscript;
int bi_superscript;
int bi_bold;
int bi_alternate;

int bo_eol;                     /* output byte offset at end-of-line */
int bo_underline;
int bo_subscript;
int bo_superscript;
int bo_bold;
int bo_alternate;

int input_byte_offset;
int output_byte_offset;
int last_c;                     /* previous input character */

/* global functions */

#if OS_TOPS20
#define Begin_alternate Beg_alternate   /* unique to 6 characters */
#define Begin_boldface  Beg_boldface
#endif

/* Old-style (non-prototype) declarations are kept deliberately: the
   definitions in this file are K&R style and some take `char'
   parameters, which a prototype declaration would not match. */
void Begin_alternate ();
void Begin_boldface ();
void End_alternate ();
void End_boldface ();
void Line_feed ();
void Superscript ();
void Subscript ();
void Underline ();
void Warning ();
void W_byte ();
void W_char ();
void W_octal ();
void W_string ();
char *strchr();

static void Control_char ();

/**********************************************************************/
/*-->Control_char */

/* Translate one WordStar control code.  c7 is the code with bit 8
   removed (0 .. 31); high_bit is TRUE when the input byte had bit 8
   set.  The only difference between the two forms is CTL-M: with
   bit 8 set the CR is suppressed when nothing has been written since
   the last end-of-line (double spacing). */
static void
Control_char (c7,high_bit)
int c7;
BOOLEAN high_bit;
{
    switch (c7)
    {
    case 5:                     /* CTL-E */
        Begin_boldface();
        break;

    case 9:                     /* CTL-I */
        W_byte(HT);
        break;

    case 10:                    /* CTL-J */
        Line_feed();
        break;

    case 12:                    /* CTL-L */
        W_byte(FF);
        break;

    case 13:                    /* CTL-M */
        if (last_c == (int)('-'))
            hyphen_break = TRUE;
        else if (!hyphen_break)
        {
            if (!high_bit || (bo_eol < output_byte_offset))
                W_byte(CR);     /* not double space */
        }
        break;

    case 17:                    /* CTL-Q */
        Begin_alternate();
        break;

    case 18:                    /* CTL-R */
        if (is_bold)
            End_boldface();     /* ELSE discard character */
        break;

    case 19:                    /* CTL-S */
        Underline();
        break;

    case 20:                    /* CTL-T */
        Superscript();
        break;

    case 22:                    /* CTL-V */
        Subscript();
        break;

    case 23:                    /* CTL-W */
        End_alternate();
        break;

    default:                    /* every other control code */
        W_octal(c7);
        break;
    }
}   /* Control_char */

/**********************************************************************/
/*-->main */

/* Main -- WSLTEX.  argv[1] names the WordStar input file; the
   translation is written to stdout and warnings to stderr.  Returns 0
   on completion; exits with status 1 on a usage error, an input open
   failure, or failure to allocate the output buffer. */
int
main(argc,argv)
int argc;
char *argv[];
{
#if OS_TOPS20
    int fp;
#endif
    register int c;             /* current input character */
    register int c7;            /* low-order 7 bits of c */
    BOOLEAN high_bit;           /* TRUE if c has bit 8 set */

    if (argc < 2)
    {
        (void)fprintf(stderr,
            "%%Usage: %s WordStar_infile >LaTeX_outfile",argv[0]);
        WRITELN(stderr);
        exit(1);
    }

    /* only 7-bit bytes required, so stdout is okay to use provided it
       does not do CRLF <--> LF translation.  On TOPS-20, this is
       suppressed by loading this program with a special version of
       the I/O library. */
    LaTeX_file = stdout;

#if OS_TOPS20
    /* Read file in bytesize 8 with binary flag set to prevent
       CRLF <--> LF mapping; the "rb" or "wb" flag is not sufficient
       for this because PCC-20 maintains two places internally where
       the binary flag is set, and both are used! */
    if ((fp = open(argv[1], FATT_RDONLY | FATT_SETSIZE | FATT_BINARY,8)) >= 0)
        WordStar_file = fdopen(fp,"rb");
    else
        WordStar_file = (FILE *)NULL;
#else
    WordStar_file = fopen(argv[1],"rb");
#endif

    if (WordStar_file == (FILE *)NULL)
    {
        (void)fprintf(stderr,
            "?Open failure on WordStar input file [%s]",argv[1]);
        WRITELN(stderr);
        exit(1);
    }

    /* Initializations -- in alphabetical order */

    bi_alternate = -1;
    bi_bold = -1;
    bi_subscript = -1;
    bi_superscript = -1;
    bi_underline = -1;
    bo_alternate = -1;
    bo_bold = -1;
    bo_subscript = -1;
    bo_superscript = -1;
    bo_underline = -1;
    eol = TRUE;
    hyphen_break = FALSE;
    input_byte_offset = -1;
    is_alternate = FALSE;
    is_bold = FALSE;
    is_subscript = FALSE;
    is_superscript = FALSE;
    is_underline = FALSE;
    last_c = -1;
    output_byte_offset = -1;

    for (ring_index = 0; ring_index <= MAXRING; ring_index++)
        ring[ring_index] = ' ';
    ring_index = 0;

    if ((the_buffer = (char *)malloc(MAXBUF)) == (char *)NULL)
    {
        (void)fprintf(stderr,
            "?Unable to allocate output buffer of size %d",MAXBUF);
        WRITELN(stderr);
        exit(1);
    }
    the_next = 0;

    /*
    =====================================================================
    WordStar makes heavy use of the 8-th (high-order) bit of 8-bit ASCII
    characters.  In general, it turns on that bit for
        -- any character (0..127) at end-of-word
        -- any character (0..127) at end-of-line
        -- "hard" line break on CTL-M
        -- "hard" page break on CTL-L

    A discretionary hyphen (which can be removed on joining the lines)
    at end-of-line is encoded as "-\215" (i.e. "-<\200|CTL-M>").  A
    required hyphen falling at end-of-line is encoded as "-\040\215"
    (i.e. followed by a space).

    Comment lines and WordStar command lines begin with a dot.

    "Hard" line breaks are encoded as \215\015 (i.e.
    "<\200|CTL-M><CTL-M>"); these occur between double-space lines.

    "Soft" line breaks are encoded as <CTL-M><CTL-J>; these occur at
    paragraph ends.  (NOTE(review): the encoding text of this sentence
    was lost; <CTL-M><CTL-J> is a reconstruction -- confirm.)

    Centered text is stored centered without any special markings.

    Underlined text is bracketed by CTL-S.

    Boldface text is bracketed by CTL-E ..text.. CTL-R.

    Superscripts are bracketed by CTL-T.

    Subscripts are bracketed by CTL-V.

    Alternate typewheel strings appear as CTL-Q ..text.. CTL-W.  Some of
    these at least are mnemonic (e = epsilon, m = mu, . = centered dot).

    CTL-R also appears after all sub/superscript sequences, apparently
    functioning as a "return to normal" state.
    =====================================================================
    */

    while ((c = getc(WordStar_file)) != EOF)
    {
        c7 = c & 0177;          /* lop off 8-th bit */
        high_bit = (c > 127);

        if (c7 < 32)            /* control character, with or without bit 8 */
            Control_char(c7,high_bit);
        else if (high_bit && (c7 == 127))
            W_octal(c7);        /* \200|DEL is written as an octal escape */
        else if (strchr(LATEX_SPECIALS,c7))
        {   /* LaTeX special characters must be quoted by backslash;
               this applies equally when WordStar has set bit 8 on the
               character (end-of-word or end-of-line) */
            W_char('\\');
            W_byte(c7);
        }
        else if (c7 == (int)('.'))      /* check for WordStar comment */
        {
            if (eol)
                W_string("%-WS-% ");
            W_byte(c7);
        }
        else                    /* c7 in 32 .. 127 */
            W_byte(c7);

        last_c = c;             /* keep all 8 bits: Line_feed() tests them */
        input_byte_offset++;
    }

    EMPTY_BUFFER;
    (void)fclose(WordStar_file);
    return (0);
}   /* Main -- WSLTEX */

/**********************************************************************/
/*-->Begin_alternate */

/* Start an alternate-character-set sequence: warn if one is already
   open, record where it starts, and write "{\alt ". */
void
Begin_alternate ()
{
    if (is_alternate)
        Warning("BEGIN ALTERNATE CHARACTER SET command encountered inside alternate sequence",
            bo_alternate,output_byte_offset);
    bi_alternate = input_byte_offset;
    bo_alternate = output_byte_offset;
    is_alternate = TRUE;
    W_string("{\\alt ");
}   /* Begin_alternate */

/**********************************************************************/
/*-->Begin_boldface */

/* Start a boldface sequence: warn if one is already open, record
   where it starts, and write "{\bf ". */
void
Begin_boldface ()
{
    if (is_bold)
        Warning("BEGIN BOLDFACE command encountered inside boldface",
            bo_bold,output_byte_offset);
    bi_bold = input_byte_offset;
    bo_bold = output_byte_offset;
    is_bold = TRUE;
    W_string("{\\bf ");
}   /* Begin_boldface */

/**********************************************************************/
/*-->End_alternate */

/* Close an alternate-character-set sequence by writing "}".  Warns
   when the sequence is longer than MAXSPECIAL output bytes, or when
   no sequence is open (the "}" is written in either case). */
void
End_alternate ()
{
    if (is_alternate)
    {
        if ((bo_alternate + MAXSPECIAL) < output_byte_offset)
            Warning("Long ALTERNATE CHARACTER SET sequence",
                bo_alternate,output_byte_offset);
    }
    else
        Warning("END ALTERNATE CHARACTER SET command encountered outside alternate text",
            bo_alternate,output_byte_offset);
    is_alternate = FALSE;
    W_byte((int)('}'));
}   /* End_alternate */

/**********************************************************************/
/*-->End_boldface */ void End_boldface () { if (is_bold) { if ((bo_bold + MAXSPECIAL) < output_byte_offset) Warning("Long BOLDFACE sequence", bo_bold,output_byte_offset); } else Warning("END BOLDFACE command encountered outside boldface", bo_bold,output_byte_offset); is_bold = FALSE; W_byte((int)('}')); } /* End_boldface */ /**********************************************************************/ /*-->Line_feed */ void Line_feed () { if (hyphen_break) /* discard line break */ hyphen_break = FALSE; else { if (bo_eol < output_byte_offset) /* text on line */ W_byte(LF); else /* empty line */ { if (last_c != (0200 | CR)) W_byte(LF); } } } /* Line_feed */ /**********************************************************************/ /*-->strchr() */ char * strchr(s,c) /* return pointer to c in s[], or (char*)NULL */ char *s; char c; { while (*s && (*s != c)) ++s; if (*s == c) return (s); else return ((char *)NULL); } /* strchr */ /**********************************************************************/ /*-->Superscript */ void Superscript () { if (is_superscript) /* ending old superscript */ { if ((bo_superscript + MAXSPECIAL) < output_byte_offset) Warning("Long SUPERSCRIPT sequence", bo_superscript,output_byte_offset); W_byte((int)('}')); } else /* beginning new superscript */ { bi_superscript = input_byte_offset; bo_superscript = output_byte_offset; W_string("^{"); } is_superscript = !is_superscript; } /* Superscipt */ /**********************************************************************/ /*-->Subscript */ void Subscript () { if (is_subscript) /* ending old superscript */ { if ((bo_subscript + MAXSPECIAL) < output_byte_offset) Warning("Long SUBSCRIPT sequence", bo_subscript,output_byte_offset); W_byte((int)('}')); } else /* beginning new superscript */ { bi_subscript = input_byte_offset; bo_subscript = output_byte_offset; W_string("_{"); } is_subscript = !is_subscript; } /* Subscript */ /**********************************************************************/ /*-->Underline 
*/ void Underline () { if (is_underline) /* ending old underline */ { if ((bo_underline + MAXSPECIAL) < output_byte_offset) Warning("Long UNDERLINE sequence", bo_underline,output_byte_offset); W_byte((int)('}')); } else /* beginning new underline */ { bi_underline = input_byte_offset; bo_underline = output_byte_offset; W_string("{\\em "); } /* underlined text is \emphasized in LaTeX */ is_underline = !is_underline; } /* Underline */ /**********************************************************************/ /*-->Warning */ void Warning (s,b1,b2) char s[]; int b1, b2 /* output byte range, or UNSET,UNSET */; { int k; WRITELN(stderr); /* !!!! temporary for debugging !!!! */ WRITELN(stderr); /* !!!! temporary for debugging !!!! */ (void)fprintf(stderr, "------------------------------------------------------------"); WRITELN(stderr); (void)fprintf(stderr,"%s",s); WRITELN(stderr); if (b1 != UNSET) { (void)fprintf(stderr,"\toutput byte range = %d .. %d",b1,b2); WRITELN(stderr); } (void)fprintf(stderr,"\tinput byte offset = %d",input_byte_offset); WRITELN(stderr); (void)fprintf(stderr,"\toutput byte offset = %d",output_byte_offset); WRITELN(stderr); (void)fprintf(stderr,"\tcurrent context = ["); WRITELN(stderr); for (k=ring_index; k <= MAXRING; k++) putc(ring[k],stderr); for (k=0; k <= ((MAXRING + ring_index - 1) % MAXRING); k++) putc(ring[k],stderr); WRITELN(stderr); (void)fprintf(stderr,"\t]"); WRITELN(stderr); (void)fprintf(stderr, "------------------------------------------------------------"); WRITELN(stderr); } /* Warning */ /**********************************************************************/ /*-->W_byte */ void W_byte (c) int c; { ring[ring_index] = (char)c; ring_index = (ring_index++) % MAXRING; /* All output bytes go through this single piece of code, so we optimize output by buffering it in large blocks. The use of write() instead of fwrite() avoids CR LF <--> LF translation on most machines, and on some is significantly more efficient. 
*/ if (the_next >= MAXBUF) EMPTY_BUFFER; the_buffer[the_next++] = c; eol = (c == LF); output_byte_offset++; if (eol) { bi_eol = input_byte_offset; bo_eol = output_byte_offset; } } /* W_byte */ /**********************************************************************/ /*-->W_char */ void W_char (c) char c; { W_byte((int)(c)); } /* W_char */ /**********************************************************************/ /*-->W_octal */ void W_octal (c) int c; { c = c & 0377; W_byte((int)('\\')); W_byte((c >> 6) + (int)('0')); W_byte(((c >> 3) & 07) + (int)('0')); W_byte((c & 07) + (int)('0')); } /* W_octal */ /**********************************************************************/ /*-->W_string */ void W_string (s) char *s; { while (*s) W_byte((int)(*s++)); } /* W_string */