#!/usr/bin/env python
# escapeTeXt.py
#  Convert text to something LaTeX is less likely to choke on.
# 2006-Aug-24 Jim Hefferon  Written.  Based on stuff from Tristan Miller and
#   Allin Cottrell
#
# Like txt2latex, but beyond being callable from the command line
#   ./escapeTeXt.py < file1.txt > file1.ltx
# it is callable from a program as a function.
#   import escapeTeXt
#   result=escapeTeXt.escapetext(string).output()
#
# TODO
#  * at sign?
#  * copyright, trademark?
#  * allow conversion of "_Great Gatsby_" to "\textit{Great Gatsby}"
#  * allow conversion of "I *really* do" to "I \emph{really} do"
#  * add refinement of recognizing fractions "2/3" ==> "$2/3$"?
import sys
import re
from getopt import getopt, GetoptError

DEBUG = False  # currently unused


class escapetextError(Exception):  # currently unused
    """Error type reserved for this module; nothing raises it yet."""
    pass


class escapetext(object):
    """Turn plain text into strings that have a chance of making it through
    LaTeX.

    Text is accumulated with the constructor and feed(), and the escaped
    result is retrieved (and the buffer emptied) with output().
    """

    def __init__(self, txt='', countQuotes=True, refinements=False):
        """Initialize instance.  Build up the text in parts (say, as lines
        from the input source) with this initializing, and using the feed()
        routine.  Then we dump it with output().
          txt=''  Text to start with
          countQuotes=True  Should we try to match open with closed quotes?
          refinements=False  Should we try to get fancy?  (Try to fix
            en dashes, em dashes, the 'p. 2' construct.)
        """
        self.countQuotes = countQuotes
        self.refinements = refinements
        # Running totals of quote characters seen so far; their parity
        # decides whether the next quote is an opening or a closing one.
        self.singleQuoteCount = 0
        self.doubleQuoteCount = 0
        self.strings = [txt]  # array of strings that we have been fed

    def feed(self, txt):
        """Add more text to the pile.
          txt  Text to add
        """
        self.strings.append(txt)

    # Raw strings so the regex escapes reach the re module untouched.
    pageNumberPattern = r"(\s)(p\.)(\s)(\d)"  # like 'p. 23'; the dot is literal
    pageNumberRE = re.compile(pageNumberPattern)
    enDashPattern = r"(\d)-(\d)"  # like '7-9'
    enDashRE = re.compile(enDashPattern)
    emDashPattern = r"((\D)-{1,2}(\D))"  # like 'but-no'
    emDashRE = re.compile(emDashPattern)

    # Characters that are special to LaTeX, and what each is replaced with.
    # They are substituted in ONE pass, so a replacement (which itself
    # contains backslashes and braces) is never escaped a second time.
    _escapes = {
        "\\": "{\\textbackslash}",     # backslashes
        "{": "\\{",                    # open curly braces
        "}": "\\}",                    # close curly braces
        "$": "\\$",                    # dollar signs
        "%": "\\%",                    # percent signs
        "_": "\\_",                    # underscores
        "&": "\\&",                    # ampersands
        "#": "\\#",                    # sharp signs
        "<": "{\\textless}",           # less than
        ">": "{\\textgreater}",        # greater than
        "^": "{\\textasciicircum}",    # carets (an error outside math mode)
        "~": "{\\textasciitilde}",     # tildes (otherwise a nonbreaking space)
    }
    _escapeRE = re.compile(r"[\\{}$%_&#<>^~]")

    def _balanceQuotes(self, s):
        """Replace straight quotes in s with LaTeX open/close quotes.

        Even-numbered occurrences (counting from zero, across every string
        processed since the counters were last reset) open; odd-numbered
        ones close.  A thin space separates a single quote from an adjacent
        double quote so that the two do not run together.
        """
        pieces = []
        priorC = None
        for c in s:
            if c == "'":
                if self.singleQuoteCount % 2 == 0:
                    if priorC == '"':  # double quote followed by open single
                        pieces.append("\\,")  # add a thinspace
                    pieces.append("`")  # open single quote
                else:
                    pieces.append("'")  # close single quote
                self.singleQuoteCount += 1
            elif c == '"':
                if self.doubleQuoteCount % 2 == 0:
                    pieces.append("``")  # open double quotes
                else:
                    if priorC == "'":  # single close followed by double close
                        pieces.append("\\,")  # add a thinspace
                    pieces.append("''")  # close double quotes
                self.doubleQuoteCount += 1
            else:
                pieces.append(c)
            priorC = c
        return "".join(pieces)

    def texifyString(self, s):
        """Convert a string to a form more acceptable to LaTeX.
          s  Plain text string.
        Returns the converted string.  When countQuotes is set this also
        advances the instance's quote counters.
        """
        # Do simple substitutions
        s = escapetext._escapeRE.sub(
            lambda m: escapetext._escapes[m.group(0)], s)
        if self.countQuotes:  # make even-numbered instances open, odd closed
            s = self._balanceQuotes(s)
        else:
            # NOTE(review): it is the backtick, not the straight single
            # quote, that gets replaced here; that looks like it may be a
            # typo for "'" -- confirm the intent before changing it.
            # maybe "\\textquotesingle" if you \usepackage{textcomp}?
            s = s.replace("`", "\\verb!'!")
            # maybe "$^\\textquotestraightdblbase$" if you
            # \usepackage{textcomp}?
            s = s.replace('"', '\\verb!"!')
        if self.refinements:
            # replace " p. 2" with " p.\,2"
            s = escapetext.pageNumberRE.sub(r'\1\2\\,\4', s)
            # replace "2-3" with "2--3"
            s = escapetext.enDashRE.sub(r'\1--\2', s)
            # replace "but -- no" with "but --- no"
            s = escapetext.emDashRE.sub(r'\2---\3', s)
        return s

    def output(self, restartCounts=False):
        """Return a string that has the parts escaped.  That clears the
        internal buffer.
          restartCounts=False  Reset the counters for whether a single or
            double quote is opening or closed
        """
        # Convert eagerly: the conversion updates the quote counters, so it
        # has to be finished before the counters are (optionally) reset.
        r = [self.texifyString(part) for part in self.strings]
        self.strings = []
        if restartCounts:
            self.singleQuoteCount = 0
            self.doubleQuoteCount = 0
        return "".join(r)


latexHead = """\\documentclass{article}
\\begin{document}
"""

latexFoot = """\\end{document}"""


# ............... script main body
def main(argv=None):
    """Run the command line converter.
      argv=None  List of command line arguments, without the program name;
        if None then sys.argv[1:] is used
    Returns the process exit status: 0 on success or after printing the
    usage statement, 1 if the arguments cannot be parsed, 2 on an
    unrecognized option.
    """
    if argv is None:
        argv = sys.argv[1:]
    # parse args
    inputFilename = None
    outputFilename = None
    countQuotes = True
    refinements = True
    latexWrap = False
    verbose = False
    usage = """%s: Convert plain text so it may make it through LaTeX
  %s [options]
where the options are
  -f filename (default %s) file to read from; if None then stdin is used
  -o filename (default %s) file to write to; if None then stdout is used
  -c Turn off the attempt to balance open and closed quotes
  -r Turn off the refinements (em dashes, en dashes ..)
  -l Add simple LaTeX header and footer
  -v (default %s) sets verbose output
  --help or -?  Give this usage statement""" % (
        sys.argv[0], sys.argv[0],
        repr(inputFilename), repr(outputFilename), repr(verbose))

    shortOptions = 'f:o:lcr?v'
    longOptions = ['help']
    try:
        (opts, args_proper) = getopt(argv, shortOptions, longOptions)
    except GetoptError as err:
        sys.stderr.write(
            "ERROR: Unable to parse the command line arguments: %s\n"
            % (err,))
        sys.stderr.write(usage + "\n")
        return 1
    for (option, parameter) in opts:
        if option == '-f':
            inputFilename = parameter
        elif option == '-o':
            outputFilename = parameter
        elif option == '-c':
            countQuotes = False
        elif option == '-r':
            refinements = False
        elif option == '-l':
            latexWrap = True
        elif option == '-v':
            verbose = True
        elif option in ('-?', '--help'):
            sys.stdout.write(usage + "\n")
            return 0
        else:
            sys.stderr.write("Unknown option: %s\n" % (option,))
            return 2

    # Done getting options; now the logic
    et = escapetext(countQuotes=countQuotes, refinements=refinements)
    if inputFilename is None:
        inFile = sys.stdin
    else:
        inFile = open(inputFilename, 'r')
    try:
        if outputFilename is None:
            outFile = sys.stdout
        else:
            outFile = open(outputFilename, 'w')
        try:
            if latexWrap:
                outFile.write(latexHead)
            # Convert line by line so large inputs are streamed; the quote
            # counters inside et carry over from one line to the next.
            for line in inFile:
                et.feed(line)
                outFile.write(et.output())
            if latexWrap:
                outFile.write(latexFoot)
        finally:
            # Close only what was opened here; the standard streams are
            # left open for the rest of the process.
            if outFile is not sys.stdout:
                outFile.close()
    finally:
        if inFile is not sys.stdin:
            inFile.close()
    if verbose:
        # Goes to stderr so it cannot end up inside the converted text.
        sys.stderr.write("%s: done\n" % (sys.argv[0],))
    return 0


if __name__ == '__main__':
    sys.exit(main())