--- spelling-stage-1.lua
--- Copyright 2012, 2013 Stephan Hennig
--
-- This work may be distributed and/or modified under the conditions of
-- the LaTeX Project Public License, either version 1.3 of this license
-- or (at your option) any later version. The latest version of this
-- license is in http://www.latex-project.org/lppl.txt
-- and version 1.3 or later is part of all distributions of LaTeX
-- version 2005/12/01 or later.
--
-- See file README for more information.
--
--- Handle lists of bad and good strings and match rules.
--
-- @author Stephan Hennig
-- @copyright 2012, 2013 Stephan Hennig
-- @release version 0.41
--
-- @trick Prevent LuaDoc from looking past here for module description.
--[[ Trick LuaDoc into entering 'module' mode without using that command.
module(...)
--]]
-- Module table. Public functions are attached to this table further
-- down and the table is returned at the end of the file.
local M = {}
-- Import external modules.
local unicode = require('unicode')
local xml = require('luaxml-mod-xml')
-- Function short-cuts. Library functions are cached in locals so the
-- code below can call them without repeated table look-ups.
local Sfind = string.find
local tabinsert = table.insert
local Ufind = unicode.utf8.find
local Ugmatch = unicode.utf8.gmatch
local Usub = unicode.utf8.sub
-- Declare local variables to store references to resources that are
-- provided by external code. All four variables are only declared
-- here; they are assigned in function `__init`.
--
-- Table of known bad strings. Maps a string to `true`.
local __is_bad
--
-- Table of known good strings. Maps a string to `true`.
local __is_good
--
-- Table of bad rules. An array of match rule functions.
local __rules_bad
--
-- Table of good rules. An array of match rule functions.
local __rules_good
--- Read a file of bad or good spellings and hand it to a parser.
-- The complete file content is slurped into one string, which is then
-- passed to the given parse function together with the target table.
-- If the file cannot be opened, a warning is written to the log and
-- nothing is parsed.  In either case an info message with the number
-- of total and new strings is written to the log afterwards.
--
-- @param fname  File name.
-- @param parse_string  Parse function.  Called as `parse_string(s, t)`
-- and expected to return the number of total and new strings.
-- @param t  Mapping table bad or good spellings should be added to.
-- @param hint  String for info message.  Either `bad` or `good`.
local function __parse_file(fname, parse_string, t, hint)
  local n_total, n_new = 0, 0
  local fh, msg = io.open(fname, 'r')
  if not fh then
    -- File is not readable: report the reason, counters stay at zero.
    texio.write_nl('package spelling: Warning! ' .. msg)
  else
    local content = fh:read('*all')
    fh:close()
    n_total, n_new = parse_string(content, t)
  end
  local stats = n_total .. '/' .. n_new
  texio.write_nl('package spelling: Info! ' .. stats .. ' total/new ' .. hint .. ' strings read from file \'' .. fname .. '\'.')
end
--- Parse a string that holds a plain list of strings.
-- The input is split at new line and carriage return characters, i.e.,
-- the expected format is one string per line.  Empty lines never
-- match the pattern and are therefore skipped.  Every line found is
-- mapped to the boolean value `true` in the given table.
--
-- @param s  Input string (a list of strings).
-- @param t  Table that maps strings to the value `true`.
-- @return Number of total and new strings found.
local function __parse_plain_list(s, t)
  local n_total, n_new = 0, 0
  for line in Ugmatch(s, '[^\r\n]+') do
    n_total = n_total + 1
    -- Only lines not yet present in the table count as new.
    if not t[line] then
      n_new = n_new + 1
      t[line] = true
    end
  end
  return n_total, n_new
end
--- Read known bad spellings from a plain list file.
-- The file is expected to contain one string per line.  Every string
-- found (a word with known incorrect spelling) is mapped to the
-- boolean value `true` in table `__is_bad`.
--
-- @param fname  File name.
local function parse_bad_plain_list_file(fname)
  local target, label = __is_bad, 'bad'
  __parse_file(fname, __parse_plain_list, target, label)
end
M.parse_bad_plain_list_file = parse_bad_plain_list_file
--- Read known good spellings from a plain list file.
-- The file is expected to contain one string per line.  Every string
-- found (a word with known correct spelling) is mapped to the boolean
-- value `true` in table `__is_good`.
--
-- @param fname  File name.
local function parse_good_plain_list_file(fname)
  local target, label = __is_good, 'good'
  __parse_file(fname, __parse_plain_list, target, label)
end
M.parse_good_plain_list_file = parse_good_plain_list_file
--- Get a custom LanguageTool XML handler.
-- The returned XML handler scans LanguageTool XML data for incorrect
-- spellings.  For every incorrect spelling found, the given call-back
-- function is called with the incorrect spelling string as argument.
--
-- XML data is checked for being created by LanguageTool (via attribute
-- `software` in tag `matches`) and for declaring the UTF-8 encoding.
-- An error is raised if the XML declaration does not declare UTF-8 or
-- if any other tag is seen before both checks have passed.
--
-- Tags `error` that carry no attributes, no spelling rule ID or an
-- incomplete context description are skipped.
--
-- @param cb  Call-back function handling incorrect spellings found in
-- XML data.
-- @return XML handler.
local function __get_XML_handler_LanguageTool(cb)
  -- Some flags for checking validity of XML data.  LanguageTool XML
  -- data must declare as being UTF-8 encoded and advertise as being
  -- created by LanguageTool.
  local is_XML_encoding_UTF_8 = false
  local is_XML_creator_LanguageTool = false
  local is_XML_valid = false

  -- Recompute the combined validity flag from the two single checks.
  local function update_validity()
    is_XML_valid = is_XML_encoding_UTF_8 and is_XML_creator_LanguageTool
  end

  -- Tell whether a rule ID denotes a spelling rule.  Non-string
  -- values (e.g. a missing attribute) never match.
  local function is_spelling_rule(ruleid)
    if type(ruleid) ~= 'string' then
      return false
    end
    return ruleid == 'HUNSPELL_RULE'
      or ruleid == 'HUNSPELL_NO_SUGGEST_RULE'
      or ruleid == 'GERMAN_SPELLER_RULE'
      or Ufind(ruleid, '^MORFOLOGIK_RULE_') ~= nil
  end

  --- Handler object for parsing LanguageTool XML data.
  -- This table contains call-backs used by LuaXML when parsing XML
  -- data.
  --
  -- @class table
  -- @name XML_handler
  -- @field decl  Handle XML declaration.
  -- @field starttag  Handle all relevant tags.
  -- @field endtag  Not used, but mandatory.
  local XML_handler = {
    decl = function(self, text, attr)
      -- Check XML encoding declaration.  A declaration without any
      -- attributes (`attr` is nil) is treated like a wrong encoding
      -- instead of failing with an index error.
      if attr and attr.encoding == 'UTF-8' then
        is_XML_encoding_UTF_8 = true
        update_validity()
      else
        error('package spelling: Error! XML data not in the UTF-8 encoding.')
      end
    end,
    starttag = function(self, text, attr)
      if text == 'matches' then
        -- Check XML creator is LanguageTool.
        if attr and attr.software == 'LanguageTool' then
          is_XML_creator_LanguageTool = true
          update_validity()
        end
      -- Any other tag requires the validity checks to have passed.
      elseif not is_XML_valid then
        error('package spelling: Error! No valid LanguageTool XML data.')
      -- Process spelling errors.  The attribute table may be nil for
      -- a tag without attributes.
      elseif text == 'error' and attr then
        if is_spelling_rule(attr.ruleid) then
          local context = attr.context
          -- Attribute values may arrive as strings; convert them
          -- explicitly so that missing values can be detected.
          local offset = tonumber(attr.contextoffset)
          local length = tonumber(attr.errorlength)
          if context and offset and length then
            -- Extract misspelled word from context attribute.  The
            -- offset is zero-based, string indices are one-based.
            cb(Usub(context, offset + 1, offset + length))
          end
        end
      end
    end,
    endtag = function(self, text)
    end,
  }
  return XML_handler
end
--- Parse a string containing LanguageTool XML data.
-- Every incorrect spelling reported by the LanguageTool XML handler is
-- mapped to the boolean value `true` in the given table.
--
-- @param s  String containing XML data.
-- @param t  Table mapping incorrect spellings to a boolean.
-- @return Number of total and new incorrect spellings found.
local function __parse_XML_LanguageTool(s, t)
  local n_total, n_new = 0, 0
  -- Call-back for the XML handler: store a bad word in the given table
  -- and keep statistics.
  local function record(word)
    if not t[word] then
      t[word] = true
      n_new = n_new + 1
    end
    n_total = n_total + 1
  end
  -- Wire handler and parser together, then run the parser on the data.
  local parser = xml.xmlParser(__get_XML_handler_LanguageTool(record))
  parser:parse(s)
  return n_total, n_new
end
--- Read known bad spellings from a LanguageTool XML file.
-- Every string reported by the XML parser (a word with known incorrect
-- spelling) is mapped to the boolean value `true` in table `__is_bad`.
--
-- @param fname  File name.
local function parse_XML_LanguageTool_file(fname)
  local target, label = __is_bad, 'bad'
  __parse_file(fname, __parse_XML_LanguageTool, target, label)
end
M.parse_XML_LanguageTool_file = parse_XML_LanguageTool_file
--- Parse default sources for bad and good strings.
-- Default sources are looked up next to the current job, i.e., their
-- names are built from `tex.jobname` plus a fixed suffix:
--
--   * `<jobname>.spell.xml`: bad spellings, LanguageTool XML file,
--   * `<jobname>.spell.bad`: bad spellings, plain list file,
--   * `<jobname>.spell.good`: good spellings, plain list file.
--
-- Bad spellings end up in table `__is_bad`, good spellings in table
-- `__is_good`.  A source that cannot be opened for reading is
-- silently skipped.
local function parse_default_bad_and_good()
  -- File name suffix and the reader responsible for it, in the order
  -- the sources are processed.
  local sources = {
    { '.spell.xml', parse_XML_LanguageTool_file },
    { '.spell.bad', parse_bad_plain_list_file },
    { '.spell.good', parse_good_plain_list_file },
  }
  for _, source in ipairs(sources) do
    local fname = tex.jobname .. source[1]
    -- Probe for existence first, so that a missing default file does
    -- not trigger the reader's warning message.
    local probe = io.open(fname, 'r')
    if probe then
      probe:close()
      source[2](fname)
    end
  end
end
M.parse_default_bad_and_good = parse_default_bad_and_good
--- Default bad dictionary look-up match rule.
-- Both arguments are looked up in the list of bad spellings, the
-- stripped string first.  The raw string is only consulted if the
-- stripped string is not found.
--
-- @param raw  Raw string to check.
-- @param stripped  Same as `raw`, but with stripped surrounding
-- punctuation.
-- @return The table entry of the first match (`true` for entries
-- added by this module), otherwise a false value (`nil`).
local function __bad_rule_bad_dictionary_lookup(raw, stripped)
  local hit = __is_bad[stripped]
  if hit then
    return hit
  end
  return __is_bad[raw]
end
--- Default good dictionary look-up match rule.
-- Both arguments are looked up in the list of good spellings, the
-- stripped string first.  The raw string is only consulted if the
-- stripped string is not found.
--
-- @param raw  Raw string to check.
-- @param stripped  Same as `raw`, but with stripped surrounding
-- punctuation.
-- @return The table entry of the first match (`true` for entries
-- added by this module), otherwise a false value (`nil`).
local function __good_rule_good_dictionary_lookup(raw, stripped)
  local hit = __is_good[stripped]
  if hit then
    return hit
  end
  return __is_good[raw]
end
--- Load match rule module.
-- Match rule modules are loaded using `require`.  The module table
-- must follow this convention: Identifiers of bad match rules start
-- with `bad_rule_`.  Identifiers of good match rules start with
-- `good_rule_`.  Other and non-function identifiers are ignored.
--
-- All match rules found in a module are appended to the tables of bad
-- and good match rules, respectively.  Arguments of a match rule
-- function are a raw string and the same string with stripped
-- surrounding punctuation.  The number of rules found is written to
-- the log.
--
-- @param fname  Module file name.
local function read_match_rules(fname)
  local n_bad, n_good = 0, 0
  local module_tab = require(fname)
  for name, rule in pairs(module_tab) do
    local is_func = type(rule) == 'function'
    -- The identifier prefix decides which rule table a function
    -- belongs to.
    if is_func and Sfind(name, '^bad_rule_') then
      tabinsert(__rules_bad, rule)
      n_bad = n_bad + 1
    elseif is_func and Sfind(name, '^good_rule_') then
      tabinsert(__rules_good, rule)
      n_good = n_good + 1
    end
  end
  local stats = n_bad .. '/' .. n_good
  texio.write_nl('package spelling: Info! ' .. stats .. ' bad/good match rules read from module \'' .. fname .. '\'.')
end
M.read_match_rules = read_match_rules
--- Module initialisation.
-- Fetches the rule tables shared via the package resource table
-- `PKG_spelling.res`, registers the two default dictionary look-up
-- match rules and creates empty lists of known bad and good
-- spellings.
local function __init()
  -- Start with empty lists of known spellings.
  __is_bad, __is_good = {}, {}
  -- Get local references to package resources.
  local res = PKG_spelling.res
  __rules_bad = res.rules_bad
  __rules_good = res.rules_good
  -- Add default dictionary look-up match rules.
  tabinsert(__rules_bad, __bad_rule_bad_dictionary_lookup)
  tabinsert(__rules_good, __good_rule_good_dictionary_lookup)
end
-- Initialize module. This call is executed as soon as this file is
-- run, so the shared rule tables and the empty spelling lists are set
-- up before any function of the module table can be called.
__init()
-- Return module table.
return M