package delig; import java.io.*; import java.util.Hashtable; import java.util.regex.*; /** * DeLig disables misplaced ligatures in LaTeX documents * using a dictionary approach. * * DeLig is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * It may be copied or modified under the terms of the * GNU General Public License version 3 * as published by the Free Software Foundation * (http://www.gnu.org/copyleft/gpl.html). * * This version is intended for German language texts only. * DeLig is partially based on the Perl script rmligs by Björn Jacke, * in particular the wordlist data is based on his igerman98 dictionary. * * @author Daniel Warner, delig@nospam@daniel-warner.de * @version 2008-03-09 */ public class DeLig { private int minLen = 4; private boolean prefixAllowed = true; private boolean suffixAllowed = true; private boolean properSubwordAllowed = true; private int tokensCharShift = 256; private String [][] tokens = {{"\"-", ""}, {"\\-", ""}, {"\"\"", ""}, {"{\\\"a}", "ä"}, {"\\\"a", "ä"}, {"\\\"{a}", "ä"}, {"\"a", "ä"}, {"{\\\"o}", "ö"}, {"\\\"o", "ö"}, {"\\\"{o}", "ö"}, {"\"o", "ö"}, {"{\\\"u}", "ü"}, {"\\\"u", "ü"}, {"\\\"{u}", "ü"}, {"\"u", "ü"}, {"{\\\"A}", "Ä"}, {"\\\"A", "Ä"}, {"\\\"{A}", "Ä"}, {"\"A", "Ä"}, {"{\\\"O}", "Ö"}, {"\\\"O", "Ö"}, {"\\\"{O}", "Ö"}, {"\"O", "Ö"}, {"{\\\"U}", "Ü"}, {"\\\"U", "Ü"}, {"\\\"{U}", "Ü"}, {"\"U", "Ü"}, {"\\ss{}", "ß"}, {"\\ss", "ß"}, {"\"s", "ß"}, {"\"z", "ß"}, }; public void process(String[] args) throws DeLigException { println("DeLig [Version 0.1]"); println("Author: Daniel Warner (delig@nospam@daniel-warner.de)"); println(); println("\tDeLig disables misplaced ligatures in LaTeX documents"); println("\tusing a dictionary approach."); println(); println("DeLig is distributed in the hope that it will be useful,"); println("but WITHOUT ANY WARRANTY; without even the implied warranty of"); println("MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."); println(); println("It may be copied or modified under the terms of the"); println("GNU General Public License version 3"); println("as published by the Free Software Foundation"); println("(http://www.gnu.org/copyleft/gpl.html)."); println(); if (args.length != 2) { println("Usage: DeLig inFilename outFilename"); } else { try { // Build ligature hash table Hashtable hash = new Hashtable(); try { BufferedReader ligatureFile = new BufferedReader(new FileReader("DeLig.list")); String value; while ((value = ligatureFile.readLine()) != null) { String key = value.replace("|", "").toLowerCase(); hash.put(key, value); } ligatureFile.close(); } catch (IOException e) { String errorMessage = "Error: Could not read ligature file DeLig.list."; throw new DeLigException(errorMessage); } // Get filenames String inFilename = args[0]; String outFilename = args[1]; // Open input and output file if (!new File(inFilename).exists()) { String errorMessage = "Error: The input file " + inFilename + " does not exist."; throw new DeLigException(errorMessage); } if (new File(outFilename).exists()) { String errorMessage = "Error: The output file " + outFilename + " already exists."; throw new DeLigException(errorMessage); } LineNumberReader in = new LineNumberReader(new FileReader(inFilename)); PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(outFilename)), true); // Process input file and write output file println("Processing " + inFilename + " and writing output to " + outFilename + " ..."); println(); String leadingBackspacesExpr = "[\\\\]*"; String firstCharExpr = "([A-Za-zäöüÄÖÜ]|(\\{\\\\\"[aouAOU]\\})|(\\\\\"[aouAOU])|(\\\\\"\\{[aouAOU]\\})|(\"[aouAOU]))"; String subExpr = "([a-zäöüß]|(\\{\\\\\"[aou]\\})|(\\\\\"[aou])|(\\\\\"\\{[aou]\\})|(\"[aou])|(\\\\ss\\{\\})|(\"s)|(\"z)|(\"-)|(\\\\-)|(\"\"))*"; String ligatureExpr = "f[fil]"; String trailingSZ = "(\\\\ss)?"; String regExpr = leadingBackspacesExpr + firstCharExpr + subExpr + ligatureExpr + subExpr + trailingSZ; Pattern pattern = Pattern.compile(regExpr); int disabledLigatures = 0; // Counts the number of disabled ligatures String inLine; // Currently processed line of the input file while ((inLine = in.readLine()) != null) { if (inLine.length() == 0) { out.println(); continue; } String outLine = inLine; boolean lineModified = false; Matcher matcher = pattern.matcher(outLine); int regionEnd = outLine.indexOf("%"); // Ignore comments in LaTeX file, if any regionEnd = (regionEnd >= 0 ? regionEnd : outLine.length()); matcher.region(0, regionEnd); while (matcher.find()) { MatchResult match = matcher.toMatchResult(); String word = match.group(); // Determine ligature, if present String compareWord = word.replace("\\\\", "").toLowerCase(); // Remove double backslashes (newline) String workWord = word.replace("\\\\", ""); int removedBackslashes = word.length() - compareWord.length(); // Number of removed backslashes for (int i = 0; i < tokens.length; i++) { compareWord = compareWord.replace(tokens[i][0], tokens[i][1]); workWord = workWord.replace(tokens[i][0], ((char) (i + tokensCharShift)) + ""); } if (compareWord.charAt(0) != '\\') // If compareWord starts with a backslash, then it is a command and may not be modified { boolean keyFound = hash.containsKey(compareWord); boolean exactMatchFound = keyFound; boolean prefixFound = false; boolean suffixFound = false; boolean properSubwordFound = false; int start = 0; int end = compareWord.length(); if (prefixAllowed && !keyFound) { start = 0; end = compareWord.length(); while (end >= minLen && !(keyFound = hash.containsKey(compareWord.substring(0, end)))) { end--; } prefixFound = keyFound; } if (suffixAllowed && !keyFound) { start = 0; end = compareWord.length(); while (compareWord.length() - start >= minLen && !(keyFound = hash.containsKey(compareWord.substring(start, compareWord.length())))) { start++; } suffixFound = keyFound; } if (properSubwordAllowed && !keyFound) { for (String key : hash.keySet()) { if (compareWord.contains(key)) { start = compareWord.indexOf(key); end = start + key.length(); keyFound = true; break; } } properSubwordFound = keyFound; } if (keyFound) { String ligatureWord = hash.get(compareWord.substring(start, end)); String modifiedWord = ""; // i: Position in workWord, i - delta: Position in compareWord, ligPos: Position in ligatureWord int delta = 0; int ligPos = 0; int i = 0; while (i < workWord.length()) { boolean inLigatureWord = (i - delta >= start && i - delta < end); if (inLigatureWord && ligatureWord.charAt(ligPos) == '|') { modifiedWord += "\"|"; ligPos++; } else { int charValue = ((int) workWord.charAt(i) - tokensCharShift); if (charValue >= 0 && charValue < tokens.length) { modifiedWord += tokens[charValue][0]; delta += (1 - tokens[charValue][1].length()); if (inLigatureWord) { ligPos += tokens[charValue][1].length(); } } else { modifiedWord += workWord.charAt(i); if (inLigatureWord) { ligPos++; } } i++; } } outLine = outLine.substring(0, match.start() + removedBackslashes) + modifiedWord + outLine.substring(match.end(), outLine.length()); String ligaturePosition = ""; if (exactMatchFound) ligaturePosition = "exact match"; else if (prefixFound) ligaturePosition = "prefix"; else if (suffixFound) ligaturePosition = "suffix"; else if (properSubwordFound) ligaturePosition = "proper subword"; println("Disabled ligature [" + ligatureWord + ", " + ligaturePosition + "] in line " + in.getLineNumber() + ": " + word.substring(removedBackslashes) + " => " + modifiedWord); disabledLigatures++; lineModified = true; matcher = pattern.matcher(outLine); regionEnd = outLine.indexOf("%"); // Ignore comments in LaTeX file, if any regionEnd = (regionEnd >= 0 ? regionEnd : outLine.length()); matcher.region(0, regionEnd); } } } if (lineModified) { println(); println("Modified line " + in.getLineNumber() + ":"); println("Input : " + inLine); println(); println("Output : " + outLine); println(); } out.println(outLine); // Write output line to output file } // Print statistics if (disabledLigatures == 0) { println("No ligatures were disabled."); } else if (disabledLigatures == 1) { println("Disabled " + disabledLigatures + " ligature in " + inFilename + "."); } else { println("Disabled " + disabledLigatures + " ligatures in " + inFilename + "."); } // Close input and output file in.close(); out.close(); } catch (IOException e) { throw new DeLigException(e.getMessage()); } } } protected void println() { System.out.println(); } protected void println(String output) { System.out.println(output); } protected void print(String output) { System.out.print(output); } public static void main(String[] args) { try { new DeLig().process(args); } catch (DeLigException e) { System.err.println(e.getMessage()); } } }