--- spelling-stage-2.lua
--- Copyright 2012, 2013 Stephan Hennig
--
-- This work may be distributed and/or modified under the conditions of
-- the LaTeX Project Public License, either version 1.3 of this license
-- or (at your option) any later version. The latest version of this
-- license is in http://www.latex-project.org/lppl.txt
-- and version 1.3 or later is part of all distributions of LaTeX
-- version 2005/12/01 or later.
--
-- See file README for more information.
--
--- Tag node lists with word strings before hyphenation takes place.
-- This module provides means to tag node lists by inserting
-- user-defined whatsit nodes before and after first and last node
-- belonging to a chain representing a string in the node list. The
-- final tag node contains a reference to a string containing the word
-- string. Tagging is applied before hyphenation takes place.
--
-- @author Stephan Hennig
-- @copyright 2012, 2013 Stephan Hennig
-- @release version 0.41
--
-- @trick Prevent LuaDoc from looking past here for module description.
--[[ Trick LuaDoc into entering 'module' mode without using that command.
module(...)
--]]
-- Module table.
local M = {}
-- Import external modules.
local recurse = require('spelling-recurse')
local unicode = require('unicode')
-- Function short-cuts.
local tabconcat = table.concat
local tabinsert = table.insert
local tabremove = table.remove
local node_new = node.new
local node_insert_after = node.insert_after
local node_insert_before = node.insert_before
local recurse_node_list = recurse.recurse_node_list
local Sfind = string.find
local Sgmatch = string.gmatch
local Smatch = string.match
local Uchar = unicode.utf8.char
local Umatch = unicode.utf8.match
-- Short-cuts for constants.
-- Node type ids, looked up by name via `node.id`.
local DISC = node.id('disc')
local GLYPH = node.id('glyph')
local KERN = node.id('kern')
local WHATSIT = node.id('whatsit')
-- Whatsit subtype ids, looked up by name via `node.subtype`.
local LOCAL_PAR = node.subtype('local_par')
local USER_DEFINED = node.subtype('user_defined')
local PDF_COLORSTACK = node.subtype('pdf_colorstack')
-- References to resources that are provided by external code. All of
-- them are bound in `__init` from the global table `PKG_spelling.res`.
--
--   __rules_bad      Table of bad rules.
--   __rules_good     Table of good rules.
--   __uid_start_tag  ID of user-defined whatsit nodes marking the start
--                    of a word.
--   __uid_end_tag    ID of user-defined whatsit nodes marking the end
--                    of a word.
local __rules_bad, __rules_good
local __uid_start_tag, __uid_end_tag
--- Module options.
-- This table contains all module options. User functions to set
-- options are provided.
--
-- @class table
-- @name __opts
-- @field hl_color Colour used for highlighting bad spellings in PDF
-- output. Written by `set_highlight_color`; unset until `__init`
-- applies the default colour.
local __opts = {
  -- Declared as a named field. A bare `hl_color,` entry would read a
  -- global variable of that name and store its value at index 1.
  hl_color = nil,
}
--- Set colour used for highlighting.
-- Set colour used for highlighting bad spellings in PDF output. The
-- argument is checked for a valid PDF colour statement. As an example,
-- the string `1 0 0 rg` represents a red colour in the RGB colour
-- space. A similar colour in the CMYK colour space would be
-- represented by the string '0 1 1 0 k'.
--
-- A valid statement consists of three components followed by `rg` or
-- four components followed by `k`, all separated by single spaces.
-- Each component must be an unsigned decimal number in the range 0 to
-- 1. Any other argument, including non-string values, raises an error.
--
-- @param col New colour.
local function set_highlight_color(col)
  local is_valid_arg = false
  -- Only strings can be colour statements. Checking the type first
  -- makes non-string arguments fail with the error below instead of
  -- with a pattern matching error.
  if type(col) == 'string' then
    -- Extract all colour components.
    local components = Smatch(col, '^(%S+ %S+ %S+) rg$') or Smatch(col, '^(%S+ %S+ %S+ %S+) k$')
    if components then
      is_valid_arg = true
      -- Validate colour components.
      for comp in Sgmatch(components, '%S+') do
        -- Check number syntax: digits with an optional decimal point,
        -- at least one digit before or after the point.
        local is_valid_comp = Sfind(comp, '^%d+%.?%d*$') or Sfind(comp, '^%d*%.?%d+$')
        if is_valid_comp then
          -- Check number range.
          local value = tonumber(comp)
          is_valid_comp = (value >= 0) and (value <= 1)
        end
        if not is_valid_comp then
          is_valid_arg = false
          break
        end
      end
    end
  end
  if is_valid_arg then
    __opts.hl_color = col
  else
    error('package spelling: Error! Invalid PDF colour statement: ' .. tostring(col))
  end
end
M.set_highlight_color = set_highlight_color
--- Highlighting status cache table.
-- Determining the highlighting status of a string can be an expensive
-- operation. To reduce average run-time penalty per string,
-- highlighting status of all strings found in a document is cached in
-- this table, so that determining the highlighting status of a known
-- string requires only one table look-up.
--
-- Keys are raw word strings, values are booleans. Entries are written
-- by `__calc_is_highlighting_needed` via `rawset`.
--
-- This table needs an `__index` meta method calculating the
-- highlighting status of unknown keys (strings).
--
-- @class table
-- @name __is_highlighting_needed
local __is_highlighting_needed = {}
--- Calculate and cache the highlighting status of a string.
-- Surrounding punctuation is stripped from the raw string. Both the
-- raw and the stripped string are then passed to the bad rules and to
-- the good rules, each list being scanned until the first match. A
-- string needs highlighting if a bad rule matches and no good rule
-- matches, i.e., good rules take precedence over bad rules. The result
-- is stored in the cache table under the raw string.
--
-- @param t Original table (the cache table).
-- @param raw Raw string to check.
-- @return True, if highlighting is required. False, otherwise.
local function __calc_is_highlighting_needed(t, raw)
  -- Raw string without leading and trailing punctuation.
  local stripped = Umatch(raw, '^%p*(.-)%p*$')
  -- Does any bad rule match?
  local is_bad = false
  for _, rule in ipairs(__rules_bad) do
    if rule(raw, stripped) then
      is_bad = true
      break
    end
  end
  -- Does any good rule match? Good rules are consulted regardless of
  -- the outcome of the bad rules.
  local is_good = false
  for _, rule in ipairs(__rules_good) do
    if rule(raw, stripped) then
      is_good = true
      break
    end
  end
  -- Good rules override bad rules.
  local status = is_bad and not is_good
  -- Cache status, bypassing meta methods.
  rawset(t, raw, status)
  return status
end
-- Unknown keys of the cache table are calculated on demand.
setmetatable(__is_highlighting_needed, { __index = __calc_is_highlighting_needed })
--- Convert a Unicode code point to a regular UTF-8 encoded string.
-- This function can be used as an `__index` meta method. The result is
-- not stored in the table, so the conversion is repeated on every
-- look-up of an unmapped key.
--
-- @param t original table (unused)
-- @param cp original key, a Unicode code point
-- @return UTF-8 encoded string corresponding to the Unicode code point.
local function __meta_cp2utf8(t, cp)
return Uchar(cp)
end
--- Meta table for code point mapping table.
-- Look-ups of code points without an explicit mapping fall back to the
-- regular UTF-8 encoded character.
--
-- @class table
-- @name __meta_codepoint_map
-- @field __index Index operator.
local __meta_codepoint_map = { __index = __meta_cp2utf8 }
--- Table of Unicode code point mappings.
-- This table maps Unicode code points to strings. The mappings are
-- used during text extraction to translate certain Unicode code points
-- to an arbitrary string instead of the corresponding UTF-8 encoded
-- character.
--
-- As an example, by adding an appropriate entry to this table, the
-- single Unicode code point U+FB00 (LATIN SMALL LIGATURE FF) can be
-- resolved into the two character string 'ff' instead of being
-- converted to the single character string 'ﬀ' (the ligature itself).
--
-- Keys are Unicode code points. Values must be strings in the UTF-8
-- encoding. If a key is not present in this table, the regular UTF-8
-- character is returned by means of the meta table.
--
-- @class table
-- @name __codepoint_map
local __codepoint_map = setmetatable({
  [0x0132] = 'IJ',  -- LATIN CAPITAL LIGATURE IJ
  [0x0133] = 'ij',  -- LATIN SMALL LIGATURE IJ
  [0x0152] = 'OE',  -- LATIN CAPITAL LIGATURE OE
  [0x0153] = 'oe',  -- LATIN SMALL LIGATURE OE
  [0x017f] = 's',   -- LATIN SMALL LETTER LONG S
  [0xfb00] = 'ff',  -- LATIN SMALL LIGATURE FF
  [0xfb01] = 'fi',  -- LATIN SMALL LIGATURE FI
  [0xfb02] = 'fl',  -- LATIN SMALL LIGATURE FL
  [0xfb03] = 'ffi', -- LATIN SMALL LIGATURE FFI
  [0xfb04] = 'ffl', -- LATIN SMALL LIGATURE FFL
  [0xfb05] = 'st',  -- LATIN SMALL LIGATURE LONG S T
  [0xfb06] = 'st',  -- LATIN SMALL LIGATURE ST
}, __meta_codepoint_map)
--- Clear all code point mappings.
-- Replaces the mapping table by a fresh, empty table that carries the
-- same meta table. Afterwards there are no known code point mappings,
-- i.e., every code point is converted to its regular UTF-8 encoded
-- character during text extraction.
local function clear_all_mappings()
  __codepoint_map = setmetatable({}, __meta_codepoint_map)
end
M.clear_all_mappings = clear_all_mappings
--- Manage Unicode code point mappings.
-- This function can be used to set-up code point mappings. First
-- argument must be a number, second argument must be a string in the
-- UTF-8 encoding or `nil`.
--
-- If the second argument is a string, after calling this function, the
-- Unicode code point given as first argument, when found in a node list
-- during text extraction, is mapped to the string given as second
-- argument instead of being converted to a UTF-8 encoded character
-- corresponding to the code point.
--
-- If the second argument is `nil`, a mapping for the given code point,
-- if existing, is deleted.
--
-- @param cp A Unicode code point, e.g., 0xfb00 for the code point LATIN
-- SMALL LIGATURE FF.
-- @param newt New target string to map the code point to or `nil`.
-- @return Old target string the code point was mapped to before
-- (possibly `nil`). If any arguments are invalid, return value is
-- `false`. Arguments are invalid if code point is not of type `number`,
-- is not an integral value in the range 0 to 0x10ffff, or if new target
-- string is neither of type `string` nor `nil`.
local function set_mapping(cp, newt)
  -- A code point is an integral number in the Unicode range. The range
  -- comparisons also reject NaN; the modulo test rejects fractional
  -- values, which could never match a glyph's character code.
  local is_valid_cp = (type(cp) == 'number')
    and (cp >= 0)
    and (cp <= 0x10ffff)
    and (cp % 1 == 0)
  local is_valid_target = (newt == nil) or (type(newt) == 'string')
  -- Prevent invalid entries in mapping table.
  if not (is_valid_cp and is_valid_target) then
    return false
  end
  -- Retrieve old mapping, bypassing the `__index` fall-back, so that an
  -- unmapped code point yields `nil`.
  local oldt = rawget(__codepoint_map, cp)
  -- Set new mapping (assigning `nil` deletes an existing mapping).
  __codepoint_map[cp] = newt
  -- Return old mapping.
  return oldt
end
M.set_mapping = set_mapping
-- Bookkeeping for the word currently being collected: its first and
-- its last node, each paired with the head node of the list the node
-- was found in. The nodes are remembered, so that once the end of the
-- word is recognized, new whatsit nodes can be inserted before the
-- first and after the last node.
local __curr_word_start_head, __curr_word_start
local __curr_word_end_head, __curr_word_end
--- Create a tag node.
-- Creates a user-defined whatsit node carrying a string value.
--
-- @param uid User ID identifying the kind of tag.
-- @param value String stored in the node.
-- @return New whatsit node.
local function __new_tag_node(uid, value)
  local tag = node_new(WHATSIT, USER_DEFINED)
  -- Mark node with an ID, so that it can be recognized later.
  tag.user_id = uid
  -- 115 is the character code of 's'; presumably this selects a
  -- string-valued whatsit (see LuaTeX manual).
  tag.type = 115
  tag.value = value
  return tag
end
--- Tag the current word in the node list.
-- Insert tag nodes (user-defined whatsit nodes) into the node list
-- before the first and after the last node belonging to the current
-- word. The start tag carries an empty string as value, the end tag
-- carries the word string.
--
-- If the first node of the word is the head node of its list, no start
-- tag is inserted (inserting before a head node is not yet
-- implemented). The end tag is inserted in any case.
--
-- @param word Word string.
local function __tag_word(word)
  if __curr_word_start ~= __curr_word_start_head then
    local start_tag = __new_tag_node(__uid_start_tag, '')
    node_insert_before(__curr_word_start_head, __curr_word_start, start_tag)
  end
  local end_tag = __new_tag_node(__uid_end_tag, word)
  node_insert_after(__curr_word_end_head, __curr_word_end, end_tag)
end
--- Highlight bad spelling by colour.
-- Wrap the current word into a pair of pdf_colorstack whatsit nodes: a
-- node pushing the highlighting colour is inserted before the first
-- node of the word, a node popping it again is inserted after the last
-- node of the word. Node field `command` is used to select the stack
-- operation.
--
-- Nothing is done if the first node of the word is the head node of
-- its list (inserting before a head node is not yet implemented).
local function __highlight_by_color()
  if __curr_word_start == __curr_word_start_head then
    return
  end
  -- Create pdf_colorstack whatsit nodes.
  local color_push = node_new(WHATSIT, PDF_COLORSTACK)
  local color_pop = node_new(WHATSIT, PDF_COLORSTACK)
  -- Push highlighting colour onto colour stack 0.
  color_push.stack = 0
  color_push.command = 1
  color_push.data = __opts.hl_color
  -- Pop from colour stack 0.
  color_pop.stack = 0
  color_pop.command = 2
  node_insert_before(__curr_word_start_head, __curr_word_start, color_push)
  node_insert_after(__curr_word_end_head, __curr_word_end, color_pop)
end
--- Highlight bad spelling by colour (using node field `cmd`).
-- Variant of function `__highlight_by_color` that selects the stack
-- operation through node field `cmd` instead of `command`. A node
-- pushing the highlighting colour is inserted before the first node of
-- the current word, a node popping it again is inserted after the last
-- node of the word.
--
-- Nothing is done if the first node of the word is the head node of
-- its list (inserting before a head node is not yet implemented).
-- @see function __highlight_by_color
local function __highlight_by_color_cmd()
  if __curr_word_start == __curr_word_start_head then
    return
  end
  -- Create pdf_colorstack whatsit nodes.
  local color_push = node_new(WHATSIT, PDF_COLORSTACK)
  local color_pop = node_new(WHATSIT, PDF_COLORSTACK)
  -- Push highlighting colour onto colour stack 0.
  color_push.stack = 0
  color_push.cmd = 1
  color_push.data = __opts.hl_color
  -- Pop from colour stack 0.
  color_pop.stack = 0
  color_pop.cmd = 2
  node_insert_before(__curr_word_start_head, __curr_word_start, color_push)
  node_insert_after(__curr_word_end_head, __curr_word_end, color_pop)
end
--- Generic function for highlighting bad spellings.
-- Delegates to whatever function the local `__highlight_by_color`
-- currently refers to. `__maintain_compatibility` may rebind that local
-- to `__highlight_by_color_cmd`.
local function __highlight_bad_word()
__highlight_by_color()
end
-- Tagging status. If true, `__finish_current_word` tags words. Set to
-- false in `__init`, toggled by `enable_text_tagging` and
-- `disable_text_tagging`.
local __is_active_tagging
-- Highlighting status. If true, `__finish_current_word` highlights bad
-- spellings. Set to false in `__init`, toggled by
-- `enable_word_highlighting` and `disable_word_highlighting`.
local __is_active_highlighting
--- Data structure that stores the characters of a word string.
-- The current word data structure is an ordered list (an array) of the
-- characters of the word. The characters are collected while scanning
-- a node list. They are concatenated to a single string only after the
-- end of a word is detected, before the word is tagged and/or
-- highlighted. The value is `nil` while no word is being collected.
--
-- @class table
-- @name __curr_word
local __curr_word
--- Act upon detection of end of current word string.
-- If a word has been collected, its characters are concatenated to the
-- word string. Unless the word is already enclosed in matching tag
-- nodes, the word is tagged (if tagging is active) and highlighted (if
-- highlighting is needed and active). Finally, all state referring to
-- the current word is reset.
local function __finish_current_word()
  if __curr_word then
    local word = tabconcat(__curr_word)
    -- Quick hack to detect a word that has been tagged before: the
    -- nodes directly surrounding the word must be a start tag and an
    -- end tag carrying the very same word string.
    local before = __curr_word_start.prev
    local after = __curr_word_end.next
    local is_already_tagged = before and after
      and (before.id == WHATSIT)
      and (before.subtype == USER_DEFINED)
      and (before.user_id == __uid_start_tag)
      and (after.id == WHATSIT)
      and (after.subtype == USER_DEFINED)
      and (after.user_id == __uid_end_tag)
      and (after.value == word)
    if not is_already_tagged then
      -- Tag node list with word string.
      if __is_active_tagging then
        __tag_word(word)
      end
      -- The highlighting status is looked up first, i.e., it is
      -- calculated and cached even while highlighting is inactive.
      if __is_highlighting_needed[word] and __is_active_highlighting then
        __highlight_bad_word()
      end
    end
    __curr_word = nil
  end
  -- Forget nodes of the finished word.
  __curr_word_start_head = nil
  __curr_word_start = nil
  __curr_word_end_head = nil
  __curr_word_end = nil
end
--- Act upon detection of end of current paragraph.
-- In this stage, finishing a paragraph only means finishing the word
-- that is currently being collected, if any.
local function __finish_current_paragraph()
-- Finish current word.
__finish_current_word()
end
--- Paragraph management stack.
-- Stack of boolean flags, that are used for logging the occurrence of a
-- new paragraph within nested vlists. The stack is created in `__init`.
local __is_vlist_paragraph
--- Paragraph management.
-- Before recursing into a vlist, a new flag with value `false` is
-- pushed onto the paragraph management stack. The flag logs the
-- occurrence of a new paragraph within the coming vlist. After
-- recursing into the vlist, the flag needs to be removed from the
-- stack again. Depending on the flag, the then current paragraph can
-- be finished.
local function __vlist_pre_recurse()
  __is_vlist_paragraph[#__is_vlist_paragraph + 1] = false
end
--- Paragraph management.
-- After recursing into a vlist, the top-most flag is removed from the
-- paragraph management stack. If the flag is set, i.e., a paragraph
-- has been started within the vlist, the current paragraph is
-- finished.
local function __vlist_post_recurse()
  local had_paragraph = tabremove(__is_vlist_paragraph)
  if not had_paragraph then
    return
  end
  __finish_current_paragraph()
end
--- Find paragraphs and strings.
-- Call-back function invoked for every node while scanning a node
-- list. Glyph nodes extend the current word. Disc and kern nodes are
-- considered part of a word and are skipped. A local_par whatsit node
-- marks the start of a paragraph: the current paragraph is finished and
-- the top-most flag of the paragraph management stack is set. Any
-- other node ends the current word.
--
-- @param head Head node of current branch.
-- @param n The current node.
local function __visit_node(head, n)
  local id = n.id
  if id == GLYPH then
    -- Provide new empty word, if necessary.
    if not __curr_word then
      __curr_word = {}
    end
    -- Remember first node of the word and its head only once per word.
    if not __curr_word_start then
      __curr_word_start_head = head
      __curr_word_start = n
    end
    -- The latest glyph is always the last known node of the word.
    __curr_word_end_head = head
    __curr_word_end = n
    -- Append (possibly mapped) character to current word string.
    tabinsert(__curr_word, __codepoint_map[n.char])
    return
  end
  if (id == DISC) or (id == KERN) then
    -- Still within the current word string. Do nothing.
    return
  end
  if (id == WHATSIT) and (n.subtype == LOCAL_PAR) then
    -- Paragraph start.
    __finish_current_paragraph()
    __is_vlist_paragraph[#__is_vlist_paragraph] = true
    return
  end
  -- End of current word string detected.
  __finish_current_word()
end
--- Table of call-back functions for node list recursion: tag the
--- word strings found in a node list.
-- The call-back functions in this table identify chains of nodes
-- representing word strings in a node list. Recognized words are
-- tagged and/or highlighted in place by inserting whatsit nodes into
-- the node list. Local_par whatsit nodes are word boundaries.
-- Nodes of type `hlist` are recursed into as if they were non-existent.
-- As an example, the LaTeX input `a\mbox{a b}b` is recognized as two
-- strings `aa` and `bb`.
--
-- @class table
-- @name __cb_tag_words
-- @field vlist_pre_recurse Paragraph management.
-- @field vlist_post_recurse Paragraph management.
-- @field visit_node Find nodes representing paragraphs and words.
local __cb_tag_words = {
vlist_pre_recurse = __vlist_pre_recurse,
vlist_post_recurse = __vlist_post_recurse,
visit_node = __visit_node,
}
--- Process node list according to this stage.
-- This function recurses into the given node list and tags and/or
-- highlights all words found, by means of the call-back functions in
-- table `__cb_tag_words`.
--
-- @param head Node list.
local function __process_node_list(head)
  -- Start from a clean state. The collected characters are discarded
  -- together with the node references: if a previous run was aborted
  -- by an error, a left-over `__curr_word` without matching start and
  -- end nodes would otherwise make `__finish_current_word` index a
  -- `nil` node.
  __curr_word = nil
  __curr_word_start_head = nil
  __curr_word_start = nil
  __curr_word_end_head = nil
  __curr_word_end = nil
  recurse_node_list(head, __cb_tag_words)
  -- Clean-up left-over word and/or paragraph.
  __finish_current_paragraph()
end
--- Call-back function that processes the node list.
-- Registered for call-back `pre_linebreak_filter` in `__init`.
--
-- @param head Node list.
-- @return The head node that was passed in. Nodes are only inserted
-- into the list; the head itself is never replaced here.
local function __cb_pre_linebreak_filter_pkg_spelling(head)
__process_node_list(head)
return head
end
--- Start tagging text.
-- After calling this function, words are tagged in node lists before
-- hyphenation takes place.
-- Sets the flag `__is_active_tagging`, which `__finish_current_word`
-- checks before calling `__tag_word`.
local function enable_text_tagging()
__is_active_tagging = true
end
M.enable_text_tagging = enable_text_tagging
--- Stop tagging text.
-- After calling this function, no more word tagging in node lists takes
-- place.
-- Clears the flag `__is_active_tagging`, which `__finish_current_word`
-- checks before calling `__tag_word`.
local function disable_text_tagging()
__is_active_tagging = false
end
M.disable_text_tagging = disable_text_tagging
--- Start highlighting bad spellings.
-- After calling this function, bad spellings are highlighted in PDF
-- output.
-- Sets the flag `__is_active_highlighting`, which
-- `__finish_current_word` checks before calling `__highlight_bad_word`.
local function enable_word_highlighting()
__is_active_highlighting = true
end
M.enable_word_highlighting = enable_word_highlighting
--- Stop highlighting bad spellings.
-- After calling this function, no more bad spellings are highlighted in
-- PDF output.
-- Clears the flag `__is_active_highlighting`. Note that
-- `__finish_current_word` still looks up (and thereby caches) the
-- highlighting status of every word while highlighting is inactive.
local function disable_word_highlighting()
__is_active_highlighting = false
end
M.disable_word_highlighting = disable_word_highlighting
--- Try to maintain compatibility with older LuaTeX versions.
-- Between LuaTeX 0.70.2 and 0.76.0 node field `cmd` of `whatsits` nodes
-- of subtype `pdf_colorstack` has been renamed to `command`. Due to a
-- bug in LuaTeX 0.76.0 (shipped with TL2013) function
-- `node.has_field()` doesn't return correct results. Therefore, the
-- field name is probed by assigning to field `command` of a temporary
-- node in protected mode. If the assignment raises an error,
-- highlighting falls back to the function using field `cmd`.
local function __maintain_compatibility()
  -- Temporary pdf_colorstack whatsit node used for probing.
  local probe = node_new(WHATSIT, PDF_COLORSTACK)
  -- Try assigning a value to node field 'command'.
  local has_field_command = pcall(function()
    probe.command = 1
  end)
  -- If the assignment is not successful, fall-back to node field 'cmd'.
  if not has_field_command then
    __highlight_by_color = __highlight_by_color_cmd
  end
  -- Delete test node.
  node.free(probe)
end
--- Module initialisation.
-- Selects the highlighting function matching the running LuaTeX
-- version, binds the package resources found in the global table
-- `PKG_spelling.res`, resets module state, sets the default
-- highlighting colour and registers the `pre_linebreak_filter`
-- call-back.
local function __init()
  -- Try to maintain compatibility with older LuaTeX versions.
  __maintain_compatibility()
  -- Get local references to package resources.
  local res = PKG_spelling.res
  __rules_bad = res.rules_bad
  __rules_good = res.rules_good
  local tag_ids = res.whatsit_ids
  __uid_start_tag = tag_ids.start_tag
  __uid_end_tag = tag_ids.end_tag
  -- Create empty paragraph management stack.
  __is_vlist_paragraph = {}
  -- Tagging and highlighting are inactive until enabled explicitly.
  __is_active_tagging = false
  __is_active_highlighting = false
  -- Set default highlighting colour.
  set_highlight_color('1 0 0 rg')
  -- Register call-back: Before TeX breaks a paragraph into lines, tag
  -- and highlight strings.
  luatexbase.add_to_callback(
    'pre_linebreak_filter',
    __cb_pre_linebreak_filter_pkg_spelling,
    '__cb_pre_linebreak_filter_pkg_spelling'
  )
end
-- Initialize module. This runs when the chunk is executed and, among
-- other things, registers the `pre_linebreak_filter` call-back (see
-- `__init`).
__init()
-- Return module table.
return M