% % \iffalse %<*driver> \documentclass{tclldoc} \newcommand{\ParseTcl}{Parse\Tcllogo} \providecommand{\href}[2]{#2} \begin{document} \DocInput{parsetcl.dtx} \end{document} % % \fi % % % \title{\ParseTcl\ --- a parser for \Tcllogo\ code} % \author{Lars Hellstr\"om} % % \maketitle % % % \begin{abstract} % \ParseTcl\ is a \Tcllogo\ package for parsing \Tcllogo\ code. % It is written completely in \Tcllogo, and make heavy use of some % new additions in \Tcllogo\,8.4. % \end{abstract} % % \begin{tcl} %<*pkg> namespace eval parsetcl {} package require Tcl 8.4 package provide parsetcl 0.1 % \end{tcl} % \setnamespace{parsetcl} % % \tableofcontents % % \section{Parser results} % % The values returned by the parser are conceptually rooted trees, but % technically the return values are lists. The general format of a % subtree is % \begin{quote} % \word{type} \word{interval} \word{text} \word{subtree}\regstar % \end{quote} % where there is one \word{subtree} for each component that remains if % the root of the subtree under consideration is deleted. The % \word{type} specifies what kind of construction the subtree % corresponds to. The \word{interval} is a list with the structure % \begin{quote} % \word{first} \word{last} % \end{quote} % where \word{first} and \word{last} are integers; the indices into % the input string of the first and last respectively character that % corresponds to this part of the parser tree. The \word{text} is, % depending on the \word{type}, either empty or the explicit string % that corresponds to this subtree. % % The \word{type}s are generally two-character strings, where the first % letter specifies a major category and the second letter an exact type % within that category, but this system is not enforced by any code. % \begin{description} % \item[Null types] % The \emph{null} types all begin with an |N|. These correspond to % information which isn't part of the script proper, but still is % useful to include in the parse result. It is generally possible % to add or remove subtrees of this type without changing the % meaning of the reconstruction of the script. % \begin{enumerate} % \item[\texttt{Nc}] \describestring[subtree type]{Nc} % This type is for comments, and it normally appears only in % places where one would expect a command subtree. The % \word{interval} includes the |#| beginning the comment, but not % the newline ending it. The \word{text} is the text in the % comment (initial |#| not included). % \item[\texttt{Ne}] \describestring[subtree type]{Ne} % This type is for syntax errors; rather than stopping at every % error, the parser routines will try to recover as best they can, % and an error item will be inserted in the parser tree. This % type of item generally appears as a \word{subtree} of whatever % was being parsed when the error was detected. The \word{text} is % a human-readable error message. The \word{interval} specifies % the erroneous characters; if the error is that some character(s) % is missing then the end of the interval is the index of the % character before that which is missing, and the start of the % interval is the character after that which is missing. % \item[\texttt{Np}] \describestring[subtree type]{Np} % This type is merely a placeholder; both the \word{interval} and % the \word{text} are empty strings. It is used when there was % nothing to return, but call syntax requires that a subtree is % returned. If some call returns an item of this type then it % usually means that its caller also has reached the end of % whatever that was parsing. % \end{enumerate} % % \item[Literate types] % The literate, or verbatim, types all begin with an |L|. Words (of % a command) where no variable or command substitution will happen % usually belong in this category. There are three types of % literates: \describestring[subtree type]{Lb}|Lb|s are delimited by % braces and \describestring[subtree type]{Lq}|Lq|s are delimited by % quotes, whereas \describestring[subtree type]{Lr}|Lr|s have no % particular delimiters. In each case, the \word{text} is the string % that this interval of the input gets parsed into. % % Backslash--newline substitution is silently performed, but other % types of substitution will result in that the item is considered % to consist of one or more parts, each of which will be % represented as a separate \word{subtree}. The delimiting quotes % of an |Lq| is never included in the \word{interval} of any % \word{subtree}. An |Lb| never has any \word{subtree}s due to % substitution, since it can only be subject to backslash--newline % substitution, but it may have |Ne| subtrees. % % \item[Merge types] % The merge types all begin with an |M|. They are similar to the % literate types, but are used for intervals of the input for which % the \emph{text} cannot be completely determined. % \describestring[subtree type]{Mq}|Mq|s are delimited by quotes, % whereas \describestring[subtree type]{Mr}|Mr|s have no particular % delimiters. The \word{text} is set to an empty string. There is % always at least one \word{subtree}. % % \item[Substitution types] % The substitution types all begin with an |S|. As the name % indicates, they are used for intervals of the input that will be % substituted by a \Tcllogo\ parser. % \begin{enumerate} % \item[\texttt{Sb}] \describestring[subtree type]{Sb} % This type is for backslash substitution. The \word{text} is % the character produced by this substitution. There is always % an |Lr| subtree for the text following the initial backslash. % \item[\texttt{Sv}] \describestring[subtree type]{Sv} % This type is for scalar variable substitution. The % \word{text} is empty, and there is one \word{subtree} for the % name of the variable being substituted. The type of this % subtree is either |Lr| or |Lb|, depending on whether the name % is surrounded by braces or not. % \item[\texttt{Sa}] \describestring[subtree type]{Sa} % This type is for array variable substitution. The % \word{text} is empty, and there are two \word{subtree}s: one % for the name of the array (always an |Lr|) and one for the % index into the array (either an |Lr| or an |Mr|). % \item[\texttt{Sc}] \describestring[subtree type]{Sc} % This type is for command substitution. The \word{text} is % empty, and there is one \word{subtree} for each command in % the script. % \end{enumerate} % % \item[The command type] % The \describestring[subtree type]{Cd}|Cd| type is for complete % commands. The \word{text} is empty, and each word in the command % has its own \word{subtree} (naturally, these are in the order of % the words in the command). % % \item[Root types] % The root types are used for the root in a parser tree, but they % may also appear as proper subtrees if the parser is invoked % recursively. This is for example what one does with the body % argument of the |while| command: technically it is a string just % like any other string, but it makes most sense to parse that % string as a script and replace the original literate item by the % entire tree generated by parsing this string. % % The root types all begin with an |R|. They are % \begin{enumerate} % \item[\texttt{Rs}] \describestring[subtree type]{Rs} % This type is for script parsing. The \word{text} is usually % empty, and each \word{subtree} is a command of the script. % \item[\texttt{Rx}] \describestring[subtree type]{Rx} % This type is for |expr| expression parsing. % \end{enumerate} % % \end{description} % % \subsection{Some examples} % % The script % \begin{quote} % |set a "b\nc"| % \end{quote} % will be parsed as the list %\begin{verbatim} % Rs {0 11} {} \ % {Cd {0 11} {} % {Lr {0 2} set} % {Lr {4 4} a} % {Lq {6 11} b\nc % {Lr {7 7} b} % {Sb {8 9} \n {Lr {9 9} n}} % {Lr {10 10} c} % } % } %\end{verbatim} % NB: This is not the canonical string representation, but one that was % chosen for clarity of exposition. The |parsetcl::format_tree| % procedure can be used to reformat a parser tree with indentation as % shown above. % % % \section{Basic script parsing} % % Basic script parsing makes a purely syntactic parsing of a script. It % does not make any assumptions about the meaning of the commands used % in the script, and hence it will not try to parse as scripts any % argument of a commands (not even the byte-compiled commands). % % The parsing relies on a set of procedures which parses the next item % of a particular sort, and returns the corresponding parse subtree. % These procedures can call each other recursively in complicated ways, % following the actual structure of the script that is being parsed. % The general syntax for calling such a procedure is % \begin{quote} % \word{proc name} \word{string} \word{index-var-name} % \word{extra}\regstar % \end{quote} % where \word{string} is the string from which something should be % parsed and \word{index-var-name} is the name of a variable in the % local context of the caller which holds the index of the first % character that hasn't been parsed yet (or possibly the last character % that has been parsed, it may depend on what is most convenient for % that particular procedure). There are two reasons for using this % set-up. One is that the calling procedure usually needs to continue % parsing the string after the part that it made a recursive call to % have parsed, and for that purpose this variable provides a convenient % method of returning the value. The other reason is one of efficiency: % If the script as a whole is passed as an argument to all procedures % that are involved in the parsing process, as opposed to passing only % the part that hasn't been parsed yet, then the need to copy the data % decreases drastically. % % % \subsection{Commands and scripts} % % \begin{proc}{flush_whitespace} % The |flush_whitespace| procedure advances the index past a % whitespace sequence. The syntax is % \begin{quote} % |parsetcl::flush_whitespace| \word{script} \word{index-var-name} % \word{cmdsep} % \end{quote} % where \word{cmdsep} is |1| if command separators (newlines and % semicolons) should be flushed as well, but |0| if they should be % treated separately. The return value is the number of characters % that were flushed. % \begin{tcl} proc parsetcl::flush_whitespace {script index_var cmdsep} { upvar 1 $index_var index if {[ if {$cmdsep} then { regexp -start $index -- {\A([ \t-\r;]|\\\n)+} $script match } else { regexp -start $index -- {\A([ \t\v\f\r]|\\\n)+} $script match } ]} then { incr index [string length $match] return [string length $match] } else { return 0 } } % \end{tcl} % \end{proc} % % \begin{proc}{parse_command} % The |parse_command| procedure parses the next command, returning a % |Cd|, |Nc|, or |Np| item to its caller. The syntax is % \begin{quote} % |parsetcl::parse_command| \word{script} \word{index-var-name} % \word{nested} % \end{quote} % The \word{nested} argument is |1| if the context is that of command % substitution (in which case |]| acts as command terminator), and |0| % otherwise. Upon return, the index variable generally points to the % character that terminates the command. % % The procedure skips past any whitespace or command separators in front % of the actual command. % \begin{tcl} proc parsetcl::parse_command {script index_var nested} { upvar 1 $index_var index % \end{tcl} % The first step is to scan past the whitespace in front of the % command. % \begin{tcl} flush_whitespace $script index 1 % \end{tcl} % Then there are two ``not a command'' cases to take care of: there % might be a comment or the command sequence might have ended. In the % former case, the extent of the comment is determined and a % corresponding |Nc| item is returned. Backslash--newline % substitution adds a bit complexity to the problem of finding the end % of the comment, since an escaped newline may be part of the comment, % although a backslash is only escaping if it isn't itself escaped. % \begin{tcl} switch -- "[string index $script $index]$nested" {#0} - {#1} { regexp -start $index -indices -- {\A#([^\n\\]|\\.)*(\\$)?}\ $script interval incr index regsub -all -- {\\\n[ \t]*}\ [string range $script $index [lindex $interval 1]]\ { } text set index [expr {[lindex $interval 1] + 1}] return [list Nc $interval $text] } 0 - 1 - \]1 { % \end{tcl} % In the latter case, an |Np| item is returned. It happens if % |$index| is past the last character in the string, but also if % |$index| is at a right bracket and the command is \word{nested}. % \begin{tcl} return [list Np "" ""] } % \end{tcl} % But if we get this far, then there is a command to parse. One only % has to parse all the words. % \begin{tcl} set res [list Cd [list $index ""] ""] set next [parse_word $script index $nested] while {[lindex $next 0] ne "Np"} { lappend res $next set next [parse_word $script index $nested] } lset res 1 1 [lindex $res end 1 1] return $res } % \end{tcl} % \end{proc} % % \begin{proc}{basic_parse_script} % The |basic_parse_script| procedure parses a script (that is to say, a % sequence of commands) and returns the corresponding |Rs| item. % The syntax is % \begin{quote} % |parsetcl::basic_parse_script| \word{script} % \end{quote} % \begin{tcl} proc parsetcl::basic_parse_script {script} { set index 0 set res [list Rs [list $index ""] ""] while {[lindex [set next [parse_command $script index 0]] 0] ne "Np"} { lappend res $next } incr index -1 lset res 1 1 $index return $res } % \end{tcl} % \end{proc} % % % \subsection{Parsing words} % % \begin{proc}{parse_word} % The |parse_word| procedure parses the next command word, returning % an |Lb|, |Lq|, |Lr|, |Mq|, |Mr|, or |Np| item to its caller. The % syntax is % \begin{quote} % |parsetcl::parse_word| \word{script} \word{index-var-name} % \word{nested} % \end{quote} % The \word{nested} argument is |1| if the context is that of command % substitution (in which case |]| acts as command terminator), and |0| % otherwise. It is assumed that the index variable points to the first % character of the word when this procedure is called. Upon return, % the index variable points to the first none-whitespace character % (or command separator) after the word that was parsed. % % Most of the actual work is done by the helper procedures % |parse_raw_word|, |parse_quoted_word|, and |parse_braced_word|. % \begin{tcl} proc parsetcl::parse_word {script index_var nested} { upvar 1 $index_var index switch -- [string index $script $index] \{ { parse_braced_word $script index $nested } \" { parse_quoted_word $script index $nested } "" - \; - \n { list Np "" "" } \] { if {$nested} then { list Np "" "" } else { parse_raw_word $script index $nested } } default { parse_raw_word $script index $nested } } % \end{tcl} % \end{proc} % % \begin{proc}{parse_braced_word} % The |parse_braced_word| procedure parses a brace-delimited word and % returns the corresponding |Lb| item to its caller. The syntax is % \begin{quote} % |parsetcl::parse_braced_word| \word{script} \word{index-var-name} % \word{nested} % \end{quote} % where the arguments and call conventions are as for |parse_word|. % In case the braced word is followed by some non-whitespace % characters, then this will be interpreted as though a space was % missing after the brace. A corresponding |Ne| subtree will be % attached to the |Lb| item returned, and the index variable will be % pointed to that none-whitespace character after the brace. % \begin{tcl} proc parsetcl::parse_braced_word {script index_var nested} { upvar 1 $index_var index set res [list Lb [list $index ""]] set depth 1 set text "" incr index while {$depth>0} { % \end{tcl} % Each iteration of this loop takes care of one non-trivial character % combination and whatever piece of trivial material that preceeds % it. Braces which are not escaped obviously constitute non-trivial % material (because they change the nesting depth), but escaped % newlines are also non-trivial (because they are subject to % substitution). % \begin{tcl} regexp -start $index -- {\A([^{}\\]|\\[^\n])*} $script match append text $match incr index [string length $match] switch -- [string index $script $index] \{ { incr depth incr index } \} { incr depth -1 incr index } \\ { if {[regexp -start $index -- {\A\\\n[ \t]*} $script match]}\ then { incr index [string length $match] append text { } } else { % \end{tcl} % In this (rather curious) case, the \word{script} ended with an % escaping backslash, but there is nothing to escape. This is % interpreted as just the backslash. However, there will be an error % since the braces apparently haven't been properly balanced. % \begin{tcl} append text \\ break } } "" { break } } if {$depth>0} then { lset res 1 1 $index lappend res $text [list Ne [list "" $index] {missing close-brace}] lset res 3 1 0 [incr index] return $res } lset res 1 1 [expr {$index - 1}] lappend res $text % \end{tcl} % What remains now is to check that there aren't any stray characters % following the close-brace. If there is whitespace to flush then % everything is alright. Things are also alright if the next character % is a command terminator or we're at the end of the string. % \begin{tcl} if {[flush_whitespace $script index 0]} then {return $res} switch -- [string index $script $index] \n - \; - {} { return $res } \] { if {$nested} then {return $res} } % \end{tcl} % But if that is not the case then there is an error, and an |Ne| item % should be appended to |res|. The exact appearence of this item does % however depend on how the parser attempts to recover from the error, % and that is not immediately obvious. The \Tcllogo\ parser makes no % attempt to recover---and can therefore report the error as ``extra % characters after close-brace''---but the best way to recover rather % seems to be to assume that a space is missing. That is why the % following error message is non-standard. % \begin{tcl} lappend res [list Ne [list $index [expr {$index - 1}]]\ {missing space after close-brace}] return $res } % \end{tcl} % \end{proc} % % \begin{proc}{parse_quoted_word} % The |parse_quoted_word| procedure parses a quote-delimited word and % returns the corresponding |Lq| or |Mq| item to its caller. The % syntax is % \begin{quote} % |parsetcl::parse_quoted_word| \word{script} \word{index-var-name} % \word{nested} % \end{quote} % where the arguments and call conventions are as for |parse_word|. % In case the quoted word is followed by some non-whitespace % characters, then this will be interpreted as though a space was % missing after the last quote. A corresponding |Ne| subtree will % be be put last in the |Lq| or |Mq| item returned, and the index % variable will be pointed to that none-whitespace character after % the quote. % \begin{tcl} proc parsetcl::parse_quoted_word {script index_var nested} { upvar 1 $index_var index set res [list Lq [list $index ""] ""] set text "" incr index while {1} { % \end{tcl} % Each iteration of this loop adds one subtree to |res| and the % corresponding literate material to |text|. If some substitution % happens which does not produce literate material then the |Lq| % in |res| is changed to an |Mq|. % \begin{tcl} switch -- [string index $script $index] \\ { lappend res [parse_backslash $script index] append text [lindex $res end 2] } \$ { lappend res [parse_dollar $script index] lset res 0 Mq } \[ { lappend res [parse_bracket $script index] lset res 0 Mq } \" { incr index break } "" { lappend res [list Ne [list $index [expr {$index - 1}]]\ {missing close-quote}] break } default { regexp -start $index -- {[^\\$\["]*} $script match set t $index incr index [string length $match] lappend res [list Lr [list $t [expr {$index - 1}]] $match] append text $match } } lset res 1 1 [expr {$index - 1}] if {[lindex $res 0] eq "Lq"} then { lset res 2 $text % \end{tcl} % An |Lq| item that has precisely one subtree that furthermore is of % type |Lr| can do without that subtree, since all the interesting % information is in the |text|. % \begin{tcl} if {[llength $res] == 4 && [lindex $res 3 0] eq "Lr"} then { set res [lrange $res 0 2] } } % \end{tcl} % What remains now is to check that there aren't any stray characters % following the close-quote. If there is whitespace to flush then % everything is alright. Things are also alright if the next character % is a command terminator or we're at the end of the string. % \begin{tcl} if {[flush_whitespace $script index 0]} then {return $res} switch -- [string index $script $index] \n - \; - {} { return $res } \] { if {$nested} then {return $res} } % \end{tcl} % But if that is not the case then there is an error, and an |Ne| item % should be appended to |res|. As with characters after close-quotes, % it is assumed that a space is missing. % \begin{tcl} lappend res [list Ne [list $index [expr {$index - 1}]]\ {missing space after close-quote}] return $res } % \end{tcl} % \end{proc} % % \begin{proc}{parse_raw_word} % The |parse_raw_word| procedure parses a whitespace-delimited word % of a command and returns the corresponding |Lr| or |Mr| item to its % caller. The syntax is % \begin{quote} % |parsetcl::parse_raw_word| \word{script} \word{index-var-name} % \word{nested} % \end{quote} % where the arguments and call conventions are as for |parse_word|. % \begin{tcl} proc parsetcl::parse_raw_word {script index_var nested} { upvar 1 $index_var index set res [list] set type Lr set interval [list $index] set text "" while {1} { % \end{tcl} % Each iteration of this loop adds one subtree to |res| and the % corresponding literate material to |text|. If some substitution % happens which does not produce literate material then |type| is set % to an |Mr|. % \begin{tcl} switch -- [string index $script $index] \\ { if {[string index $script [expr {$index+1}]] eq "\n"} then { break } lappend res [parse_backslash $script index] append text [lindex $res end 2] continue } \$ { lappend res [parse_dollar $script index] set type Mr continue } \[ { lappend res [parse_bracket $script index] set type Mr continue } \t - \n - \v - \f - \r - " " - \; - "" { break } if {$nested} then { if {![ regexp -start $index -- {\A[^\\$\[\]\t-\r ;]+} $script match ]} then {break} } else { regexp -start $index -- {\A[^\\$\[\t-\r ;]+} $script match } set t $index incr index [string length $match] lappend res [list Lr [list $t [expr {$index - 1}]] $match] append text $match } % \end{tcl} % In case there is only a single element in |res| then that will be % the result. % \begin{tcl} if {[llength $res]==1} then { set res [lindex $res 0] } else { lappend interval [expr {$index - 1}] if {$type ne "Lr"} then {set text ""} set res [linsert $res 0 $type $interval $text] } flush_whitespace $script index 0 return $res } % \end{tcl} % \end{proc} % % % \subsection{Parsing substitutions} % % \begin{proc}{parse_backslash} % The |parse_backslash| procedure parses a backslash sequence and % returns the corresponding |Sb| item. The syntax is % \begin{quote} % |parsetcl::parse_backslash| \word{script} \word{index-var-name} % \end{quote} % and it is assumed that the index character in \word{script} is the % initial backslash in the sequence. Upon return, the index character % is the first character after the backslash sequence. % \begin{tcl} proc parsetcl::parse_backslash {script index_var} { upvar 1 $index_var index set start $index incr index set ch [string index $script $index] set res [list Lr [list $index $index] $ch] switch -- $ch a { set res [list Sb [list $start $index] \a $res] } b { set res [list Sb [list $start $index] \b $res] } f { set res [list Sb [list $start $index] \f $res] } n { set res [list Sb [list $start $index] \n $res] } r { set res [list Sb [list $start $index] \r $res] } t { set res [list Sb [list $start $index] \t $res] } v { set res [list Sb [list $start $index] \v $res] } x { if {[regexp -start [expr {$index + 1}] -- {\A[0-9A-Fa-f]+}\ $script match]} then { scan [string range $match end-1 end] %x code incr index [string length $match] lset res 1 1 $index lset res 2 "x$match" set res [list Sb [list $start $index]\ [format %c $code] $res] } else { set res [list Sb [list $start $index] x $res] } } u { if {[regexp -start [expr {$index + 1}] -- {\A[0-9A-Fa-f]{1,4}}\ $script match]} then { scan $match %x code incr index [string length $match] lset res 1 1 $index lset res 2 "u$match" set res [list Sb [list $start $index]\ [format %c $code] $res] } else { set res [list Sb [list $start $index] u $res] } } \n { regexp -start [expr {$index + 1}] -- {\A[ \t]*} $script match incr index [string length $match] lset res 1 1 $index lset res 2 "\n$match" set res [list Sb [list $start $index] " " $res] } "" { return [list Sb [list $start $start] \\] } default { if {[regexp -start $index -- {\A[0-7]{1,3}} $script match]} then { scan $match %o code incr index [expr {[string length $match]-1}] lset res 1 1 $index lset res 2 $match set res [list Sb [list $start $index] [format %c $code] $res] } else { set res [list Sb [list $start $index] $ch $res] } } incr index return $res } % \end{tcl} % \end{proc} % % \begin{proc}{parse_bracket} % The |parse_bracket| procedure parses one pair of command % substitution brackets and returns the corresponding |Sc| item. % The syntax is % \begin{quote} % |parsetcl::parse_bracket| \word{script} \word{index-var-name} % \end{quote} % and it is assumed that the index character in \word{script} is the % initial bracket in the sequence. Upon return, the index character % is the first character after the close-bracket. % \begin{tcl} proc parsetcl::parse_bracket {script index_var} { upvar 1 $index_var index set res [list Sc [list $index ""] ""] incr index while {[lindex [set next [parse_command $script index 1]] 0] ne "Np"} { lappend res $next } if {[string index $script $index] eq "\]"} then { lset res 1 1 $index incr index return $res } else { lappend res [list Ne [list $index [expr {$index-1}]]\ {missing close-bracket}] lset res 1 1 [expr {$index-1}] return $res } } % \end{tcl} % \end{proc} % % \begin{variable}{varname_RE} % The |varname_RE| is the regular expression used with |$| % substitution for grabbing the name of the variable. % \begin{tcl} set parsetcl::varname_RE {\A(\w|::)+} % \end{tcl} % The reason for factoring this out into a variable is that % \Tcllogo\ isn't using the natural (Unicode) definition of an % alphanumeric character here, but relies on a locale-dependent % C~library function to perform this test. They generally agree on % ASCII characters, but outside that one probably cannot rely on that % the test produces sensible results. Hence if a script needs to be % parsed where some non-Unicode behaviour of this library function is % significant, then you might need to change the above regexp. % % There is a \Tcllogo\ bug report~\cite{Letter-bug} on the matter, % and hopefully there will be no need to ever change this regexp % once that has been resolved. % \end{variable} % % \begin{proc}{parse_dollar} % The |parse_dollar| procedure parses one |$|-sequence and returns % the corresponding |Sv|, |Sa|, or |Lr| item. The syntax is % \begin{quote} % |parsetcl::parse_dollar| \word{script} \word{index-var-name} % \end{quote} % and it is assumed that the index character in \word{script} is the % initial |$| in the sequence. Upon return, the index character % is the first character after the parsed sequence. % \begin{tcl} proc parsetcl::parse_dollar {script index_var} { upvar 1 $index_var index set res [list "" [list $index ""] ""] incr index % \end{tcl} % If the first character after the |$| is a left brace, then the % variable is scalar and its name is terminated by the next right % brace. Note that braces do not nest in this case. % \begin{tcl} if {[string index $script $index] eq "\{"} then { lset res 0 Sv set end [string first \} $script $index] if {$end<0} then { set end [expr {[string length $script] - 1}] lappend res [list Lb [list $index $end]\ [string range $script [expr {$index + 1}] end]]\ [list Ne [list [expr {$end+1}] $end]\ {missing close-brace for variable name}] } else { lappend res [list Lb [list $index $end]\ [string range $script [expr {$index + 1}] [expr {$end-1}]]] } lset res 1 1 $end set index [expr {$end + 1}] return $res } % \end{tcl} % Otherwise see if there is something which can be interpreted as an % alphanumeric variable name. First treat the case when there isn't; % in that case the |$| is just a literate |$| and it is returned as an % |Lr| iterm. Then treat the case that the variable is scalar. % \begin{tcl} variable varname_RE if {![regexp -start $index -- $varname_RE $script match]} then { if {[string index $script $index] eq "("} then { set match "" } else { return [list Lr [list [lindex $res 1 0] [lindex $res 1 0]] \$] } } set t $index incr index [string length $match] lappend res [list Lr [list $t [expr {$index-1}]] $match] if {[string index $script $index] ne "("} then { lset res 0 Sv lset res 1 1 [lindex $res 3 1 1] return $res } % \end{tcl} % What remains is to treat the case of an array variable. This is % very much like |parse_quoted_word|, but it is the right parenthesis % rather than |"| that acts as terminator. % \begin{tcl} lset res 0 Sa incr index set subres [list Lr [list $index ""] ""] lappend res "" set text "" while {1} { % \end{tcl} % Each iteration of this loop adds one subtree to |subres| and the % corresponding literate material to |text|. If some substitution % happens which does not produce literate material then the |Lr| % in |subres| is changed to an |Mr|. % \begin{tcl} switch -- [string index $script $index] \\ { lappend subres [parse_backslash $script index] append text [lindex $subres end 2] } \$ { lappend subres [parse_dollar $script index] lset subres 0 Mr } \[ { lappend subres [parse_bracket $script index] lset subres 0 Mr } ) { lset subres 1 1 [expr {$index - 1}] break } "" { lappend res\ [list Ne [list $index [incr index -1]] {missing )}] lset subres 1 1 $index break } default { regexp -start $index -- {[^\\$\[)]*} $script match set t $index incr index [string length $match] lappend subres [list Lr [list $t [expr {$index - 1}]] $match] append text $match } } if {[lindex $subres 0] eq "Lr"} then {lset subres 2 $text} if {[llength $subres] == 4} then {set subres [lindex $subres 3]} lset res 1 1 $index incr index lset res 4 $subres return $res } % \end{tcl} % \end{proc} % % % % \section{Some utility procdures} % % \subsection{Viewing parser trees} % % It is useful to have a surveyable presentation of parser trees. An % easy step towards this would be to ensure that nesting depth is % mirrored in the indentation. % % \begin{proc}{format_tree} % The |format_tree| procedure takes a parser tree, a base indentation % (string of whitespace), and an indentation step (string of whitespace) % as argument. It returns a string that is list-wise equivalent to % |[list]| of the original tree, but has indentation mirroring the % nesting depth. % \begin{tcl} proc parsetcl::format_tree {tree base step} { set res $base append res \{ [lrange $tree 0 1] { } % \end{tcl} % The following is a trick to make a list element string % representation for the \word{text} that does not contain any sort % of newline: making the element unbalanced with respect to braces % forces escape-quoting also for newlines. % \begin{tcl} if {[regexp {[\n\r]} [lindex $tree 2]]} then { append res [string range [list "[lindex $tree 2]\{"] 0 end-2] } else { append res [lrange $tree 2 2] } if {[llength $tree]<=3} then { append res \} return $res } elseif {[llength $tree] == 4 &&\ [string match {S[bv]} [lindex $tree 0]]} then { % \end{tcl} % This is a slight optimization: |Sb| and |Sv| trees look better on % one line than on three. % \begin{tcl} append res " " [format_tree [lindex $tree 3] "" ""] \} return $res } append res \n foreach subtree [lrange $tree 3 end] { append res [format_tree $subtree $base$step $step] \n } append res $base \} } % \end{tcl} % \end{proc} % % % \subsection{Offsetting intervals} % % \begin{proc}{offset_intervals} % This procedure modifies a \word{tree} by adding \word{offset} to % all endpoints of each interval in that tree, and returns the % modified tree. The syntax is % \begin{quote} % |parsetcl::offset_intervals| \word{tree} \word{offset} % \end{quote} % \begin{tcl} proc parsetcl::offset_intervals {tree offset} { set res [lrange $tree 0 2] foreach i {0 1} { lset res 1 $i [expr {[lindex $res 1 $i] + $offset}] } foreach subtree [lrange $tree 3 end] { lappend res [offset_intervals $subtree $offset] } return $res } % \end{tcl} % \end{proc} % % \begin{proc}{reparse_Lb_as_script} % The |reparse_Lb_as_script| procedure replaces an |Lb| node from an % existing parser tree with the |Rs| node produced by parsing the text % between the braces as a script. The syntax is % \begin{quote} % |parsetcl::reparse_Lb_as_script| \word{tree-var} \word{node-index} % \word{parsed string} % \end{quote} % where \word{tree-var} is the name in the caller's local context of % the variable in which the tree to substitute is stored and % \word{node-index} is the index list of the node to replace. The % procedure returns |2| if the node was an |Lb| node, |1| if is was % an |Lr| or |Lq| node, and |0| if the node was not of any of these % types. In the last case, the tree is not modified. In the case that % |1| is returned, the node has been substituted, but the intervals % may be slightly off since the string to parse was taken from the % \word{text} part of the substituted node. The normal case is that |2| % is returned, and in this case the intervals unambiguously refer to % positions in the \word{parsed string}. % \begin{tcl} proc parsetcl::reparse_Lb_as_script {tree_var index parsed} { upvar 1 $tree_var tree set node [lindex $tree $index] switch -- [lindex $node 0] Lb - Lr - Lq { set base [expr {[lindex $node 1 0] + 1}] if {[lindex $node 0] eq "Lb"} then { set script [string range $parsed $base\ [expr {[lindex $node 1 1] - 1}]] } else { set script [lindex $node 2] } lset tree $index\ [offset_intervals [basic_parse_script $script] $base] if {[lindex $node 0] eq "Lb"} then { return 2 } else { return 1 } } default { return 0 } } % \end{tcl} % \end{proc} % % % \subsection{Traversing parser trees} % % \begin{proc}{walk_tree} % The |walk_tree| procedure has the syntax % \begin{quote} % |parsetcl::walk_tree| \word{tree-var} \word{index-var} % \begin{regblock}[\regplus]\word{type-pattern} % \word{body}\end{regblock} % \end{quote} % It walks through each node in the tree stored in the % \word{tree-var} in the order that they appear in the string, i.e., % first the root of a tree, then it walks through each subtree in % sequence. When visiting a node, the type of the node is % regexp-matched against the \word{type-pattern}s, and the first % matching \word{body} is evaluated (entirely using |switch -regexp|) % in the local context of the caller. While walking, the procedure is % updating the \word{index-var} variable so that it always contains % the index list specifying the current node in the tree. % % It is OK to modify the tree while walking through it, provided that % the index list of the current node remains valid throughout. It is % particular possible to change the contents of the current node (add % or remove children) without messing things up. % % There is no particular return value from this procedure. % \begin{tcl} proc parsetcl::walk_tree {tree_var index_var args} { upvar 1 $tree_var tree $index_var idxL set idxL [list] set i 0 while {$i>=0} { if {$i==0} then { uplevel 1 [list switch -regexp --\ [lindex [lindex $tree $idxL] 0] $args] set i 3 } elseif {$i < [llength [lindex $tree $idxL]]} then { lappend idxL $i set i 0 } elseif {[llength $idxL]} then { set i [lindex $idxL end] set idxL [lrange $idxL 0 end-1] incr i } else { set i -1 } } } % \end{tcl} % \end{proc} % % \begin{proc}{simple_parse_script} % The |simple_parse_script| procedure is similar to % |basic_parse_script|, but it tries to recognise some common control % structures (|if|, |for|, etc.) and reparses those arguments that % (for standard command definitions) are scripts. The syntax is % \begin{quote} % |parsetcl::simple_parse_script| \word{script} % \end{quote} % and it returns the parsed script. % \begin{tcl} proc parsetcl::simple_parse_script {script} { set tree [parsetcl::basic_parse_script $script] walk_tree tree indices Cd { switch -- [lindex [lindex $tree $indices] 3 2] if { for {set i 3} {$i < [llength [lindex $tree $indices]]}\ {incr i} { switch -- [lindex [lindex $tree $indices] $i 2]\ if - elseif { incr i; continue } then - else { incr i } parsetcl::reparse_Lb_as_script tree\ [linsert $indices end $i] $script } } while { parsetcl::reparse_Lb_as_script tree [linsert $indices end 5]\ $script } for { parsetcl::reparse_Lb_as_script tree [linsert $indices end 4]\ $script parsetcl::reparse_Lb_as_script tree [linsert $indices end 6]\ $script parsetcl::reparse_Lb_as_script tree [linsert $indices end 7]\ $script } foreach { parsetcl::reparse_Lb_as_script tree [linsert $indices end end]\ $script } catch { parsetcl::reparse_Lb_as_script tree [linsert $indices end 4]\ $script } proc { parsetcl::reparse_Lb_as_script tree [linsert $indices end 6]\ $script } } return $tree } % \end{tcl} % \end{proc} % % \section{Parsing expressions} % % This hasn't been implemented yet. % % % \section{Advanced script parsing} % % Most of the meaning of a \Tcllogo\ script depends on its commands % rather than being given by the syntax---there is e.g. nothing in the % general syntax that makes the last argument of a |foreach| command % more likely to contain a script than any of the other arguments---and % therefore any higher level parsing of \Tcllogo\ scripts must employ a % table of commands % % This hasn't been implemented yet. % % % \section{Script reconstruction} % % This hasn't been implemented yet, but the parser trees contain % the necessary information. Applications of script reconstruction % includes writing a |proc|-like command that inlines code in the body % argument before the procedure is created. % % % \begin{thebibliography}{9} % \bibitem{Letter-bug} % Kevin B. Kenny and Jeffrey Hobbs: % \textit{Dollar-substitution and non-Latin-1}, % \Tcllogo~project bug \#408568 (2001, still open April~2003); % \href{https://sourceforge.net/tracker/^^A % ?func=detail&aid=408568&group_id=10894&atid=110894}^^A % {\textsc{https}:/\slash \texttt{sourceforge.net}\slash % \texttt{tracker}\slash % \texttt{?func=detail\&}\penalty\exhyphenpenalty % \texttt{aid=408568\&}\penalty\exhyphenpenalty % \texttt{group\_id=10894\&}\penalty\exhyphenpenalty % \texttt{atid=110894}.} % % \end{thebibliography} % \endinput