% \iffalse meta-comment
%
% Copyright (C) 2011-2018 by Enrico Gregorio
%
% -------------------------------------------------------
%
% This work may be distributed and/or modified under the
% conditions of the LaTeX Project Public License, either
% version 1.3c of this license or (at your option) any
% later version. The latest version of this license is in
%   http://www.latex-project.org/lppl.txt
% and version 1.3 or later is part of all distributions
% of LaTeX version 2005/12/01 or later.
%
% This work has the LPPL maintenance status `maintained'.
%
% The Current Maintainer of this work is Enrico Gregorio.
%
% This work consists of the files
%    newunicodechar.dtx
%    newunicodechar.ins
% and the derived file newunicodechar.sty.
%
% \fi
%
% \iffalse
% The guarded lines below serve two purposes: the `driver' parts are
% used when this file is typeset, the `package' parts form the header
% of the derived file newunicodechar.sty. Every opening guard needs
% its closing counterpart.
%<*driver>
\ProvidesFile{newunicodechar.dtx}
%</driver>
%<package>\NeedsTeXFormat{LaTeX2e}[2018/04/01]
%<package>\ProvidesPackage{newunicodechar}
%<*package>
  [2018/04/08 v1.2 Defining Unicode characters]
%</package>
%
%<*driver>
\documentclass{ltxdoc}
\usepackage[T1]{fontenc}
\usepackage{metalogo,booktabs,lmodern,textcomp,wasysym}
\EnableCrossrefs
\CodelineIndex
\RecordChanges
\begin{document}
  \DocInput{newunicodechar.dtx}
  \PrintChanges
  \PrintIndex
\end{document}
%</driver>
% \fi
%
% \CheckSum{201}
%
% \CharacterTable
%  {Upper-case    \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z
%   Lower-case    \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z
%   Digits        \0\1\2\3\4\5\6\7\8\9
%   Exclamation   \!     Double quote  \"     Hash (number) \#
%   Dollar        \$     Percent       \%     Ampersand     \&
%   Acute accent  \'     Left paren    \(     Right paren   \)
%   Asterisk      \*     Plus          \+     Comma         \,
%   Minus         \-     Point         \.     Solidus       \/
%   Colon         \:     Semicolon     \;     Less than     \<
%   Equals        \=     Greater than  \>     Question mark \?
% Commercial at \@ Left bracket \[ Backslash \\ % Right bracket \] Circumflex \^ Underscore \_ % Grave accent \` Left brace \{ Vertical bar \| % Right brace \} Tilde \~} % % \changes{v1.2}{2018/04/08}{Now utf8 is default} % \changes{v1.1}{2012/11/12}{Added support for \mbox{\textsf{inputenx}}} % \changes{v1.0}{2011/02/15}{Initial version} % % \GetFileInfo{newunicodechar.dtx} % % \DoNotIndex{\newcommand,\newenvironment,\!,\@empty,\@gobble,\@gobbletwo} % \DoNotIndex{\@ifpackageloaded,\@ifpackagewith,\@ifundefined,\@namedef} % \DoNotIndex{\@nil,\@onlypreamble,\@tempa,\@tempb,\@tempswafalse,\def} % \DoNotIndex{\@tempswatrue,\^,\-,\active,\begingroup,\catcode,\@car,\@cdr} % \DoNotIndex{\edef,\else,\endgroup,\endinput,\expandafter,\fi,\if} % \DoNotIndex{\if@tempswa,\ifcase,\ifnum,\ifx,\lccode,\let,\lowercase} % \DoNotIndex{\MessageBreak,\next,\number,\numexpr,\or,\PackageError} % \DoNotIndex{\PackageWarning,\PackageWarningNoLine,\strip@prefix,\@@end} % \DoNotIndex{\relax,\space,\string,\DeclareOption,\ProcessOptions} % \DoNotIndex{\meaning,\ifdefined,\csname,\chardef,\endcsname,\protect} % % \title{The \textsf{newunicodechar} package\thanks{This document % corresponds to \textsf{newunicodechar}~\fileversion, dated \filedate.}} % \author{Enrico Gregorio \\ \texttt{Enrico dot Gregorio at univr dot it}} % % \maketitle % % \section{Introduction} % % When using Unicode input with \LaTeX{} it's not so uncommon to get an % incomprehensible error message such as %\begin{verbatim} %Unicode char \u8:xxx not set up for use with LaTeX %\end{verbatim} % where |xxx| may be the actual character we input or a combination of % strange characters. This happens because the \texttt{utf8} option % given to \textsf{inputenc} or \textsf{inputenx} defines the \LaTeX{} % meaning of many Unicode characters, but, of course, not all of them. 
%
% For example, one might want to write some Latin words with prosodic
% marks, i.e., the diacritics that tell whether a vowel is long or
% short, in order to distinguish between `p\u{o}p\u{u}lus' (people)
% and `p\=op\u{u}lus' (poplar), but using the actual Unicode
% characters that make the \LaTeX{} document easier to read; look at
% the following table, where on the left the input is via the \LaTeX{}
% Internal Character Representation (LICR) and on the right it's via
% Unicode characters,
% \begin{center}
% \begin{tabular}{cc}
% \toprule
% LICR & Unicode \\
% \midrule
% \verb|p\u{o}p\u{u}lus| & \texttt{p\u{o}p\u{u}lus}\\
% \verb|p\={o}p\u{u}lus| & \texttt{p\={o}p\u{u}lus}\\
% \bottomrule
% \end{tabular}
% \end{center}
% and judge for yourselves which one is better. Unfortunately, the
% \texttt{utf8} option to \textsf{inputenc} doesn't define a meaning
% for \texttt{\u{o}}, \texttt{\={o}}, and \texttt{\u{u}}. As a matter
% of fact, only \texttt{\u{a}} and \texttt{\u{A}} are defined, as they
% are used in the Romanian language.
%
% One might resort to |\DeclareUnicodeCharacter| in the document's
% preamble, but this requires looking up the (long) list of Unicode
% characters and jotting down the relevant numbers. For example,
% \texttt{\u{o}} is \texttt{U+014F}, so the declaration
%\begin{verbatim}
%\DeclareUnicodeCharacter{014F}{\u{o}}
%\end{verbatim}
% would do for \texttt{\u{o}}.
%
% The present package introduces a simpler interface that frees the
% user from the burden of looking things up in the tables: all that's
% needed is
% \begin{flushleft}
% |\newunicodechar{|\texttt{\u{o}}|}{\u{o}}|
% \end{flushleft}
% You are not restricted to definitions like this: for example,
% \eighthnote{}~is Unicode \texttt{U+266A}, but you are not required
% to know it: if your editor can insert the character~\eighthnote, you
% may define its meaning by loading a package that provides it and saying
% \begin{flushleft}
% |\usepackage{wasysym}|\\
% |\newunicodechar{|\eighthnote|}{\eighthnote}|
% \end{flushleft}
% Important note: the package will not work if the \textsf{inputenc} package
% is called with an option different from \texttt{utf8}. As of April 2018,
% this encoding is assumed by default in 8~bit \TeX{} engines when running
% \LaTeX. However one can still load \textsf{inputenc} or \textsf{inputenx}
% with the \texttt{utf8} option (not \texttt{utf8x}).
%
% A similar problem may arise even with \XeLaTeX{}. A frequently asked
% question on mailing lists or discussion groups is how to print some
% particular character in a different font than the main one of the
% document, say, for example, the Euro sign which, in some fonts, is
% horrible. The usual answer is to write something like
% \begin{flushleft}
% |\newfontfamily{\eurofont}{|\meta{some font}|}|\\
% |\catcode`|\texttt{\texteuro}|=\active|\\
% |\protected\def |\texttt{\texteuro}|{{\eurofont\char`\|^^A
% \texttt{\texteuro}|}}|
% \end{flushleft}
% which, for the average user, is somewhat scary.
% With \textsf{newunicodechar} this may be simplified into
% \begin{flushleft}
% |\newfontfamily{\eurofont}{|\meta{some font}|}|\\
% |\newunicodechar{|\texttt{\texteuro}|}{{\eurofont\texteuro}}|
% \end{flushleft}
%
% \section{Usage}
%
% The package requires the use of a Unicode engine, i.e., \XeLaTeX{}
% or \LuaLaTeX{}, or, with (pdf)\LaTeX{}, the \textsf{inputenc} or
% \textsf{inputenx} package along with the \texttt{utf8} option. It
% won't work with the \texttt{utf8x} option that employs a completely
% different mechanism for parsing Unicode characters in
% (pdf)\LaTeX{}. It should be said that \texttt{utf8x} defines many
% more characters than \texttt{utf8}, so that the present package
% wouldn't be needed.
%
% Of course the \LaTeX{} document must be written using a Unicode
% savvy editor.
%
% \bigskip
%
% The package has only one option, \texttt{verbose}^^A
% \marginpar{\raggedleft\texttt{verbose}}^^A
% \index{verbose=\texttt{verbose} option|usage}, which is off by
% default. If the package is called by
% |\usepackage[verbose]{newunicodechar}|, then the informative message
% on the log file will show the old definition along with the warning
% about the redefinition. Unfortunately this definition usually has a
% rather cryptic format; for example, redefining \texttt{\u{a}} would
% print
%\begin{verbatim}
%Redefining Unicode character; it meant
%***  \IeC {\u a}  ***
%before your redefinition on input line 22.
%\end{verbatim}
% Call the package with this option if you are worried about what you
% are redefining, but the meaning of the Unicode character should
% correspond easily to just one LICR entry. This option does nothing
% when a Unicode engine is used, because no character is active
% initially (except for~|~|), so there should be no risk of redefining
% anything.
%
% \DescribeMacro{\newunicodechar}
% The package provides only one command, |\newunicodechar|, which must
% be called with two arguments:
% \begin{flushleft}
% |\newunicodechar{|\meta{char}|}{|\meta{code}|}|
% \end{flushleft}
% where \meta{char} is the Unicode character to which we need to give
% a meaning and \meta{code} is that meaning, that is the \LaTeX{} code
% that will be substituted for the character. Here is what's needed for
% the prosodic marks in Latin:
% \begin{flushleft}
% |%\newunicodechar{|\texttt{\u{A}}|}{\u{A}}|
% |\newunicodechar{|\texttt{\u{a}}|}{\u{a}}|\\
% |\newunicodechar{|\texttt{\u{E}}|}{\u{E}}|
% |\newunicodechar{|\texttt{\u{e}}|}{\u{e}}|\\
% |\newunicodechar{|\texttt{\u{I}}|}{\u{I}}|
% |\newunicodechar{|\texttt{\u{\i}}|}{\u{\i}}|\\
% |\newunicodechar{|\texttt{\u{O}}|}{\u{O}}|
% |\newunicodechar{|\texttt{\u{o}}|}{\u{o}}|\\
% |\newunicodechar{|\texttt{\u{U}}|}{\u{U}}|
% |\newunicodechar{|\texttt{\u{u}}|}{\u{u}}|\\
% |\newunicodechar{|\texttt{\={A}}|}{\={A}}|
% |\newunicodechar{|\texttt{\={a}}|}{\={a}}|\\
% |\newunicodechar{|\texttt{\={E}}|}{\={E}}|
% |\newunicodechar{|\texttt{\={e}}|}{\={e}}|\\
% |\newunicodechar{|\texttt{\={I}}|}{\={I}}|
% |\newunicodechar{|\texttt{\={\i}}|}{\={\i}}|\\
% |\newunicodechar{|\texttt{\={O}}|}{\={O}}|
% |\newunicodechar{|\texttt{\={o}}|}{\={o}}|\\
% |\newunicodechar{|\texttt{\={U}}|}{\={U}}|
% |\newunicodechar{|\texttt{\={u}}|}{\={u}}|
% \end{flushleft}
% The first line is commented out, because those characters are
% already defined. It doesn't hurt to give again a definition,
% \LaTeX{} will just warn about it.
%
% Caution: when used with \XeLaTeX{} or \LuaLaTeX{}, there will be no
% warning, but the standard setup doesn't define any active character.
%
% The first argument \emph{must} consist of a single Unicode
% character, but the package checks for it and raises an error
% otherwise; it will raise an error also if the first argument
% consists of a plain ASCII character: defining them is not allowed in
% (pdf)\LaTeX{} and would break almost everything in \XeLaTeX{} or
% \LuaLaTeX{}.
%
% \section{An easier way?}
%
% One could dispense with this package, since the same effect may be
% obtained in the way we have already seen in \XeLaTeX{} or \LuaLaTeX{};
% with (pdf)\LaTeX{}, calling |\newunicodechar{|\texttt{\={u}}|}{\={u}}|
% is equivalent to saying
% \begin{flushleft}
% |\makeatletter|\\
% |\@namedef{u8:\detokenize{|\texttt{\={u}}|}}{\={u}}|\\
% |\makeatother|
% \end{flushleft}
%
% \StopEventually{}
%
% \section{Implementation}
%
% The usual presentation, that we repeat here for completeness.
%\begin{verbatim}
%\ProvidesFile{newunicodechar.dtx}
%\NeedsTeXFormat{LaTeX2e}[2018/04/01]
%\ProvidesPackage{newunicodechar}
%\end{verbatim}
% The date for the format is the one requested in the actual header of
% this file; as said in the introduction, as of April 2018 the
% \texttt{utf8} encoding is assumed by default.
%
% Now the real macros. First of all we check that the typesetting
% engine is sufficiently recent to include $\varepsilon$-\TeX{}
% extensions: if the engine's version register is not known we raise
% an error and end the run.
%    \begin{macrocode}
\@ifundefined{eTeXversion}{%
  \PackageError{newunicodechar}%
    {LaTeX engine too old, aborting}%
    {Please upgrade your TeX system}%
  \@@end
}{}
%    \end{macrocode}
%
% \subsection{Options}
% There's only one option.
%    \begin{macrocode}
\DeclareOption{verbose}{\let\nuc@verbose=T}
\ProcessOptions\relax
%    \end{macrocode}
% \subsection{Error messages}
% Three errors can be raised: one for a plain ASCII character, one for
% an empty first argument and one for a first argument that is not a
% single valid character.
%    \begin{macrocode}
\def\nuc@onebyteerr{%
  \PackageError{newunicodechar}%
    {ASCII character requested}%
    {Only characters above U+007F may be defined; you asked for\MessageBreak
     a plain ASCII character and your definition has been ignored.}%
}
\def\nuc@emptyargerr{%
  \PackageError{newunicodechar}%
    {Empty argument}%
    {You shouldn't write \protect\newunicodechar{}{...}}%
}
\def\nuc@invalidargerr{%
  \PackageError{newunicodechar}%
    {Invalid argument}%
    {The first argument to \protect\newunicodechar\space is either\MessageBreak
     too long or an invalid sequence of bytes}%
}
%    \end{macrocode}
%
% \subsection{Unicode engines}
% In case we are running a Unicode engine (\texttt{xelatex} or
% \texttt{lualatex}), the definition of the main macro is easier: we
% check whether the character is above $127$ and, in this case, we
% make it active expanding to the second argument. This definition of
% the main macro will be seen only if the engine has the |^^^^|
% convention for inputting characters in hexadecimal format, so that
% |^^^^0021| is one token and |\@gobble| will eat it up, making
% |\next| equal to |\@empty|; with an eight bit engine (\texttt{latex}
% or \texttt{pdflatex}), |\@gobble| will swallow only |^^^|.
%
% In this case we define the main macro all in one swoop; first we
% check that the first argument is nonempty, then we act only if it
% consists of only one (character) token. Then if the charcode of this
% token is not greater than $127$ we emit an error message; otherwise we
% activate the character and define its (protected) expansion to be
% the second argument. The last action, in this case, is to allow
% |\newunicodechar| only in the preamble. Then we do |\endinput|. If
% the engine is not Unicode savvy, everything up to the closing |\fi|
% is swallowed up.
%
% \begin{macro}{\newunicodechar}
% Here is the code for defining the Unicode engine version of the main
% macro. Notice that we try being on the safe side by not assuming any
% particular category code for~|~|, because we restore it after giving
% the definition where we need it to be active.
%    \begin{macrocode}
\begingroup
\catcode`\^=7 \catcode30=12 \catcode`\!=12 % for safety
\edef\next{\@gobble^^^^0021}
\expandafter\endgroup
\ifx\next\@empty % Start of code for Unicode engines
  \chardef\nuc@atcode=\catcode`\~
  \catcode`\~=\active
  \def\newunicodechar#1#2{%
    \if\relax\detokenize{#1}\relax
      \nuc@emptyargerr
    \else
      \if\relax\detokenize\expandafter{\@cdr#1\@nil}\relax
        \ifnum`#1>\string"7F
          \catcode`#1=\active
          \begingroup\lccode`\~=`#1
          \lowercase{\endgroup\protected\def~}{#2}%
        \else
          \nuc@onebyteerr
        \fi
      \else
        \nuc@invalidargerr
      \fi
    \fi
  }
  \catcode`\~=\nuc@atcode
  \@onlypreamble\newunicodechar
  \expandafter\endinput\fi % End of code for Unicode engines
%    \end{macrocode}
% \end{macro}
%
% \subsection{Eight bit engines}
% From now on we can assume an eight bit engine is used; we check that
% the current input encoding, as recorded in |\inputencodingname|, is
% \texttt{utf8}, otherwise we define |\newunicodechar| to swallow its
% arguments, after a warning.
% \changes{v1.2}{2018/04/08}{Now utf8 is default}
% \changes{v1.1}{2012/11/12}{Added support for \mbox{\textsf{inputenx}}}
%    \begin{macrocode}
\def\nuc@stop{%
  \PackageWarningNoLine{newunicodechar}%
    {This package only works if the document\MessageBreak
     encoding is `utf8'}%
  \let\newunicodechar\@gobbletwo
  \endinput
}
\edef\@tempa{\detokenize{utf8}}
\edef\@tempb{\detokenize\expandafter{\inputencodingname}}
\ifx\@tempb\@tempa\else\nuc@stop\fi
%    \end{macrocode}
% \begin{macro}{\newunicodechar}
% The main macro. We set the temporary switch to false and put in
% |\nuc@tempa| the first argument, but detokenized, since it consists of
% active characters.
% We check that it's not empty and access its
% first token that we put into |\@tempb|. Then we call |\nuc@check|
% and if it sets the temporary switch to true, we execute the
% definition, since the first argument is a valid UTF-8 character, but
% we issue a warning if it already has a meaning. All we need to do is
% to define the macro |\csname u8:\nuc@tempa\endcsname|, because that's
% how \textsf{inputenc} acts. The part between |\ifdefined| and the
% corresponding |\fi| will be executed only with the \texttt{verbose}
% option.
%    \begin{macrocode}
\def\newunicodechar#1#2{%
  \@tempswafalse
  \edef\nuc@tempa{\detokenize{#1}}%
  \if\relax\nuc@tempa\relax
    \nuc@emptyargerr
  \else
    \edef\@tempb{\expandafter\@car\nuc@tempa\@nil}%
    \nuc@check
    \if@tempswa
      \@ifundefined{u8:\nuc@tempa}%
        {}%
        {\PackageWarning{newunicodechar}%
           {Redefining Unicode character\ifdefined\nuc@verbose; it meant\MessageBreak
            ***\space\space\nuc@meaning\space\space***\MessageBreak
            before your redefinition\fi}}%
      \@namedef{u8:\nuc@tempa}{#2}%
    \fi
  \fi
}
%    \end{macrocode}
% \end{macro}
% The first helper macro computes the number of bytes in the first
% argument, though it appears to the user as a single character. The
% second is used for the \texttt{verbose} option.
%    \begin{macrocode}
\def\nuc@getlength#1{%
  \ifx#1\@nil
    \expandafter\relax
  \else
    +1\expandafter\nuc@getlength
  \fi
}
\ifdefined\nuc@verbose
  \def\nuc@meaning{%
    \expandafter\expandafter\expandafter\strip@prefix
    \expandafter\meaning\csname u8:\nuc@tempa\endcsname
  }
\fi
%    \end{macrocode}
% We select the action based on the length of the first argument; in
% case only one byte appears, the user is trying to define an ASCII
% character, which is not allowed; if the input is two, three, or four
% bytes long, we check whether the first byte has the correct form:
% its binary form must be, respectively, \texttt{110xxxxx},
% \texttt{1110xxxx}, or \texttt{11110xxx}, so not less than $192$,
% $224$, or $240$.
% The macro |\nuc@ch@ck| does precisely this, issuing
% an error message if the argument is invalid or else setting the
% temporary switch to true.
%    \begin{macrocode}
\def\nuc@check{%
  \ifcase\numexpr0\expandafter\nuc@getlength\nuc@tempa\@nil
    % length 0: nothing to do
  \or
    \nuc@onebyteerr % length 1
  \or
    \nuc@ch@ck{192}% length 2
  \or
    \nuc@ch@ck{224}% length 3
  \or
    \nuc@ch@ck{240}% length 4
  \else
    \nuc@invalidargerr % anything longer
  \fi
}
\def\nuc@ch@ck#1{%
  \expandafter\ifnum\expandafter`\@tempb<#1\relax
    \nuc@invalidargerr
  \else
    \@tempswatrue
  \fi
}
%    \end{macrocode}
% Finally we disallow |\newunicodechar| outside the preamble.
%    \begin{macrocode}
\@onlypreamble\newunicodechar
%    \end{macrocode}
% \Finale
\endinput