--- spelling-stage-3.lua
--- Copyright 2012, 2013 Stephan Hennig
--
-- This work may be distributed and/or modified under the conditions of
-- the LaTeX Project Public License, either version 1.3 of this license
-- or (at your option) any later version. The latest version of this
-- license is in http://www.latex-project.org/lppl.txt
-- and version 1.3 or later is part of all distributions of LaTeX
-- version 2005/12/01 or later.
--
-- See file README for more information.
--
--- Store the text of a LuaTeX document in a text document data
--- structure.
-- This module provides means to extract text from a LuaTeX document and
-- to store it in a text document data structure.
--
-- In the text document, words are stored as UTF-8 encoded strings. A
-- mapping mechanism is provided by which, during word string
-- recognition, individual code-points, e.g., of glyph nodes, can be
-- translated to arbitrary UTF-8 strings, e.g., ligatures to single
-- letters.
--
-- @author Stephan Hennig
-- @copyright 2012, 2013 Stephan Hennig
-- @release version 0.41
--
-- @trick Prevent LuaDoc from looking past here for module description.
--[[ Trick LuaDoc into entering 'module' mode without using that command.
module(...)
--]]
-- Module table.
local M = {}
-- Import external modules.
local recurse = require('spelling-recurse')
-- Local aliases for functions that are called frequently.
local recurse_node_list = recurse.recurse_node_list
local tabinsert, tabremove = table.insert, table.remove
-- Node type id of whatsit nodes.
local WHATSIT = node.id('whatsit')
-- Whatsit subtypes that are tested for while visiting nodes.
-- NOTE(review): newer LuaTeX releases may expose local_par as a node
-- type of its own instead of a whatsit subtype -- confirm against the
-- LuaTeX version in use.
local LOCAL_PAR, USER_DEFINED =
  node.subtype('local_par'), node.subtype('user_defined')
-- Local references to resources that are provided by external code.
-- All of them are assigned during module initialisation.
local
  -- Text document data structure.
  __text_document,
  -- ID of user-defined whatsit nodes marking the start of a word.
  __uid_start_tag,
  -- ID of user-defined whatsit nodes marking the end of a word.
  __uid_end_tag
--- Module options.
-- This table contains all module options.  User functions to set
-- options are provided.
--
-- @class table
-- @name __opts
-- @field table_par When processing a table, when should paragraphs be
-- inserted into the text document?  Defaults to 0.
--
--
-- - 0 - Don't touch tables in any way.
-- - 1 - Insert paragraphs before and after hlists of type
-- alignment column or row, i.e., before and after
-- every table row.
-- - 2 - Insert paragraphs before and after hlists of type
-- alignment cell, i.e., before and after every table
-- cell.
--
local __opts = {
  -- Spelled as an explicit key/value pair: a bare `table_par,` would be
  -- a positional entry whose value is read from a global variable of
  -- that name, not a declaration of the option field.
  table_par = 0,
}
--- Set table behaviour.
-- Stores the given value in module option `table_par`, which controls
-- when paragraphs are inserted within tables.  The value is stored
-- as-is, without validation.
--
-- @param value  New value of option `table_par`.
local function set_table_paragraphs(value)
  __opts.table_par = value
end
-- Export function in module table.
M.set_table_paragraphs = set_table_paragraphs
--- Data structure that stores the word strings found in a node list.
-- The value is `nil` while no paragraph is open, otherwise an array of
-- word strings.
--
-- @class table
-- @name __curr_paragraph
local __curr_paragraph

--- Act upon detection of end of current word string.
-- Appends the string stored in field `value` of the given tag node to
-- the current paragraph.  A new paragraph is opened first, if none is
-- open.
--
-- @param n  String tag node.
local function __finish_current_word(n)
  -- Open a fresh paragraph on demand.
  __curr_paragraph = __curr_paragraph or {}
  -- Store word string at the end of the paragraph.
  tabinsert(__curr_paragraph, n.value)
end
--- Act upon detection of end of current paragraph.
-- If a paragraph is currently open, it is appended to the text
-- document and the current paragraph is reset.  Otherwise, nothing
-- happens.
local function __finish_current_paragraph()
  local paragraph = __curr_paragraph
  -- Nothing to do without an open paragraph.
  if not paragraph then
    return
  end
  -- Move paragraph into document structure.
  tabinsert(__text_document, paragraph)
  __curr_paragraph = nil
end
--- Paragraph management stack.
-- Stack of boolean flags that are used for logging the occurrence of a
-- new paragraph within nested vlists.
local __is_vlist_paragraph

--- Paragraph management.
-- Pushes a new flag, initially `false`, onto the paragraph management
-- stack before recursing into a vlist.  The flag logs the occurrence
-- of a new paragraph within that vlist.  After recursing into the
-- vlist, the flag needs to be removed from the stack again.  Depending
-- on the flag, the then current paragraph can be finished.
local function __vlist_pre_recurse()
  local stack = __is_vlist_paragraph
  stack[#stack + 1] = false
end
--- Paragraph management.
-- Pops the top-most flag from the paragraph management stack after
-- recursing into a vlist.  If the flag is set, the current paragraph is
-- finished.
local function __vlist_post_recurse()
  if tabremove(__is_vlist_paragraph) then
    __finish_current_paragraph()
  end
end
--- Handle table lines and cells.
-- Finishes the current paragraph at an hlist of subtype `alignment
-- column or row` (4), if option `table_par` is 1, or at an hlist of
-- subtype `alignment cell` (5), if option `table_par` is 2.  Since this
-- function is registered as pre and post recursion call-back for
-- hlists, paragraphs are broken before and after such an hlist.
--
-- @param n  hlist node.
local function __handle_table(n)
  local subtype, table_par = n.subtype, __opts.table_par
  -- Paragraph break around table rows requested?
  local at_row = (subtype == 4) and (table_par == 1)
  -- Paragraph break around table cells requested?
  local at_cell = (subtype == 5) and (table_par == 2)
  if at_row or at_cell then
    __finish_current_paragraph()
  end
end
--- Find paragraphs and strings.
-- While scanning a node list, this call-back function finds nodes
-- representing the start of a paragraph (local_par whatsit nodes) and
-- string tags (user_defined whatsit nodes carrying the end tag ID).
-- All other nodes are ignored.
--
-- @param head  Head node of current branch (not used).
-- @param n  The current node.
local function __visit_node(head, n)
  -- Only whatsit nodes are of interest.
  if n.id ~= WHATSIT then
    return
  end
  local subtype = n.subtype
  -- Test for word string tag.
  if (subtype == USER_DEFINED) and (n.user_id == __uid_end_tag) then
    __finish_current_word(n)
  -- Test for paragraph start.
  elseif subtype == LOCAL_PAR then
    __finish_current_paragraph()
    -- Log the new paragraph in the flag of the innermost vlist.  With
    -- an empty stack there is no flag to update; writing to index 0
    -- would leave a stray entry outside of the stack.
    local depth = #__is_vlist_paragraph
    if depth > 0 then
      __is_vlist_paragraph[depth] = true
    end
  end
end
--- Table of call-back functions for node list recursion: store the
--- word strings found in a node list.
-- The call-back functions in this table identify string tags in a node
-- list and store the tagged strings in the text document.  A new
-- paragraph is started at local_par whatsit nodes and after finishing
-- a vlist containing a local_par whatsit node.  Nodes of type `hlist`
-- are recursed into as if they were non-existent.  As an example, the
-- LaTeX input `a\mbox{a b}b` is recognized as two strings `aa` and
-- `bb`.
--
-- @class table
-- @name __cb_store_words
-- @field visit_node  Find nodes representing paragraphs and words.
-- @field hlist_pre_recurse  Table management.
-- @field hlist_post_recurse  Table management.
-- @field vlist_pre_recurse  Paragraph management.
-- @field vlist_post_recurse  Paragraph management.
local __cb_store_words = {
  -- Paragraph and word detection.
  visit_node = __visit_node,
  -- Table management: the same handler runs before and after an hlist.
  hlist_pre_recurse = __handle_table,
  hlist_post_recurse = __handle_table,
  -- Paragraph management within nested vlists.
  vlist_pre_recurse = __vlist_pre_recurse,
  vlist_post_recurse = __vlist_post_recurse,
}
--- Process node list according to this stage.
-- Recurses into the given node list with the call-back functions that
-- store tagged strings in the text document.  Afterwards, a paragraph
-- that is still open is finished.
--
-- @param head  Node list.
local function __process_node_list(head)
  -- Collect words and paragraphs.
  recurse_node_list(head, __cb_store_words)
  -- Flush a left-over paragraph into the text document.
  __finish_current_paragraph()
end
-- Call-back status.  While this flag is true, the shipout call-back
-- stores text in the text document.
local __is_active_storage

--- Call-back function that processes the node list.
-- If text storage is active, the node list found in the given box
-- register is processed.  Otherwise, nothing happens.
--
-- This function is not made available in the module table, but in
-- the global package table!
--
-- @param box  Key of the box in table `tex.box` whose node list is
-- processed.
local function cb_AtBeginShipout(box)
  -- Storage switched off?
  if not __is_active_storage then
    return
  end
  __process_node_list(tex.box[box])
end
--- Start storing text.
-- Switches the call-back status flag on.  From then on, the shipout
-- call-back stores text in the text document.
local function enable_text_storage()
  __is_active_storage = true
end
-- Export function in module table.
M.enable_text_storage = enable_text_storage
--- Stop storing text.
-- Switches the call-back status flag off.  From then on, the shipout
-- call-back stores no more text in the text document.
local function disable_text_storage()
  __is_active_storage = false
end
-- Export function in module table.
M.disable_text_storage = disable_text_storage
--- Module initialisation.
-- Fetches references to shared resources from the global package table
-- `PKG_spelling`, registers the shipout call-back in that table and
-- resets the module state to its defaults.
local function __init()
  -- Get local references to package resources.
  local res = PKG_spelling.res
  __text_document = res.text_document
  local ids = res.whatsit_ids
  __uid_start_tag = ids.start_tag
  __uid_end_tag = ids.end_tag
  -- Make \AtBeginShipout function available in package table.
  PKG_spelling.cb_AtBeginShipout = cb_AtBeginShipout
  -- Start with an empty paragraph management stack.
  __is_vlist_paragraph = {}
  -- Text storage is switched off initially.
  __is_active_storage = false
  -- Default table paragraph behaviour: don't touch tables.
  set_table_paragraphs(0)
end
-- Initialize module.  The call is executed here, at top level, i.e.,
-- while this file is being loaded.
__init()
-- Return module table containing the exported functions.
return M