/* scan.c -- scanning Info files and nodes

   Copyright 1993-2023 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.

   Originally written by Brian Fox. */

#include "info.h"
#include "session.h"
#include "scan.h"
#include "util.h"
#include "tag.h"

/* NOTE(review): the three bare #include directives below have lost their
   header names in this copy of the file.  The one guarded by HAVE_ICONV is
   presumably the iconv header; the other two cannot be determined from
   here -- restore them from the upstream file before building. */
#include
#if HAVE_ICONV
# include
#endif
#include

#ifdef __MINGW32__
/* MinGW uses a replacement nl_langinfo, see pcterm.c. */
# define nl_langinfo rpl_nl_langinfo
extern char * rpl_nl_langinfo (nl_item);
/* MinGW uses its own replacement wcwidth, see pcterm.c for the reasons.
   Since Gnulib's wchar.h might redirect wcwidth to rpl_wcwidth, we
   explicitly undo that here. */
#undef wcwidth
#endif

#ifdef __hpux
#define va_copy(ap1,ap2) memcpy((&ap1),(&ap2),sizeof(va_list))
#endif

/* Filename most recently extracted by info_parse_node (), or null if the
   parsed string had none. */
char *info_parsed_filename = NULL;

/* Nodename most recently extracted by info_parse_node (), or null if the
   parsed string had none. */
char *info_parsed_nodename = NULL;

/* Read a filename enclosed in "(" and ")" at the start of STRING, allowing
   for nested brackets.  If FILENAME is not null, store a newly allocated
   copy of the text between the brackets in *FILENAME.

   Return the number of bytes occupied by the bracketed filename, brackets
   included.  Return 0 without touching *FILENAME if STRING does not begin
   with "(".  If no ")" is found at all, set *FILENAME to null and
   return 0. */
int
read_bracketed_filename (char *string, char **filename)
{
  char *body;               /* Text following the opening "(". */
  int depth = 1;            /* Current level of bracket nesting. */
  int first_close = -1;     /* Offset in BODY of the first ")" seen. */
  int end = 0;              /* Offset in BODY where the filename stops. */

  if (string[0] != '(')
    return 0;
  body = string + 1;

  while (body[end] != '\0')
    {
      char c = body[end];

      if (c == '(')
        depth++;
      else if (c == ')')
        {
          if (first_close < 0)
            first_close = end;
          if (--depth == 0)
            break;
        }
      end++;
    }

  if (depth > 0)
    {
      /* Input ran out before the brackets balanced.  Fall back on the
         first ")" as the end of the filename, if there was one. */
      if (first_close < 0)
        {
          if (filename)
            *filename = 0;
          return 0;
        }
      end = first_close;
    }

  if (filename)
    {
      /* xcalloc zero-fills, so the copy is already null-terminated. */
      *filename = xcalloc (1, end + 1);
      memcpy (*filename, body, end);
    }

  return end + 2; /* Filename plus the "(" and ")". */
}

/* Split STRING, of the form "(FILENAME)NODENAME", into its two parts and
   store them in INFO_PARSED_FILENAME and INFO_PARSED_NODENAME.  Previous
   values of both variables are freed here; calling code must not free
   them.  A part that is absent leaves its variable as a null pointer. */
void
info_parse_node (char *string)
{
  char *cursor;
  int nodename_len;

  free (info_parsed_filename);
  free (info_parsed_nodename);
  info_parsed_filename = 0;
  info_parsed_nodename = 0;

  /* Nothing passed in, so there is nothing to return. */
  if (string == 0 || *string == '\0')
    return;

  cursor = string + skip_whitespace_and_newlines (string);
  cursor += read_bracketed_filename (cursor, &info_parsed_filename);

  /* Whatever follows the filename is the node name. */
  cursor += skip_whitespace_and_newlines (cursor);
  nodename_len = read_quoted_string (cursor, "", 0, &info_parsed_nodename);

  if (nodename_len != 0)
    canonicalize_whitespace (info_parsed_nodename);
}

/* Set *OUTPUT to a copy of the string starting at START and finishing at a
   character in TERMINATOR, unless START[0] == INFO_QUOTE, in which case
   copy string from START+1 until the next occurrence of INFO_QUOTE.  If
   TERMINATOR is an empty string, finish at a null character.  LINES is
   the number of lines that the string can span.  If LINES is zero, there
   is no limit.  Return length of string including any quoting characters.
   Return 0 if input was invalid.
*/ long read_quoted_string (char *start, char *terminator, int lines, char **output) { long len; char *nl = 0, saved_char; if (lines) { int i; nl = start; for (i = 0; i < lines; i++) { nl = strchr (nl, '\n'); if (!nl) break; /* End of input string reached. */ nl++; } if (nl) { saved_char = *nl; *nl = '\0'; } } if (start[0] != '\177') { len = strcspn (start, terminator); if (*terminator && !start[len]) { len = 0; *output = 0; } else { *output = xmalloc (len + 1); strncpy (*output, start, len); (*output)[len] = '\0'; } } else { len = strcspn (start + 1, "\177"); if (*terminator && !(start + 1)[len]) { /* No closing 177 byte. */ len = 0; *output = 0; } else { *output = xmalloc (len + 1); strncpy (*output, start + 1, len); (*output)[len] = '\0'; len += 2; /* Count the two 177 bytes. */ } } if (nl) *nl = saved_char; return len; } /* **************************************************************** */ /* */ /* Finding and Building Menus */ /* */ /* **************************************************************** */ /* Get the entry associated with LABEL in the menu of NODE. Return a pointer to the ENTRY if found, or null. Return value should not be freed by caller. If SLOPPY, allow initial matches, like "Buffers" for a LABEL "buffer". */ REFERENCE * info_get_menu_entry_by_label (NODE *node, char *label, int sloppy) { register int i; int best_guess = -1; REFERENCE *entry; REFERENCE **references = node->references; if (!references) return 0; for (i = 0; (entry = references[i]); i++) { if (entry->type != REFERENCE_MENU_ITEM) continue; if (mbscasecmp (label, entry->label) == 0) return entry; /* Exact, case-insensitive match. */ else if (sloppy && best_guess == -1 && (mbsncasecmp (entry->label, label, strlen (label)) == 0)) best_guess = i; } if (sloppy && best_guess != -1) return references[best_guess]; return 0; } /* A utility function for concatenating REFERENCE **. Returns a new REFERENCE ** which is the concatenation of REF1 and REF2. 
*/ REFERENCE ** info_concatenate_references (REFERENCE **ref1, REFERENCE **ref2) { register int i, j; REFERENCE **result; int size = 0; /* Get the total size of the slots that we will need. */ if (ref1) { for (i = 0; ref1[i]; i++); size += i; } if (ref2) { for (i = 0; ref2[i]; i++); size += i; } result = xmalloc ((1 + size) * sizeof (REFERENCE *)); /* Copy the contents over. */ j = 0; if (ref1) { for (i = 0; ref1[i]; i++) result[j++] = ref1[i]; } if (ref2) { for (i = 0; ref2[i]; i++) result[j++] = ref2[i]; } result[j] = NULL; return result; } /* Copy a reference structure. Copy each field into new memory. */ REFERENCE * info_copy_reference (REFERENCE *src) { REFERENCE *dest = xmalloc (sizeof (REFERENCE)); dest->label = src->label ? xstrdup (src->label) : NULL; dest->filename = src->filename ? xstrdup (src->filename) : NULL; dest->nodename = src->nodename ? xstrdup (src->nodename) : NULL; dest->start = src->start; dest->end = src->end; dest->line_number = src->line_number; dest->type = src->type; return dest; } /* Copy a list of references, copying in reference in turn with info_copy_reference. */ REFERENCE ** info_copy_references (REFERENCE **ref1) { int i; REFERENCE **result; int size; if (!ref1) return 0; /* Get the total size of the slots that we will need. */ for (i = 0; ref1[i]; i++); size = i; result = xmalloc ((1 + size) * sizeof (REFERENCE *)); /* Copy the contents over. */ for (i = 0; ref1[i]; i++) result[i] = info_copy_reference (ref1[i]); result[i] = NULL; return result; } void info_reference_free (REFERENCE *ref) { if (ref) { free (ref->label); free (ref->filename); free (ref->nodename); free (ref); } } /* Free the data associated with REFERENCES. */ void info_free_references (REFERENCE **references) { register int i; REFERENCE *entry; if (references) { for (i = 0; references && (entry = references[i]); i++) info_reference_free (entry); free (references); } } /* Return new REFERENCE with filename and nodename fields set. 
*/ REFERENCE * info_new_reference (char *filename, char *nodename) { REFERENCE *r = xmalloc (sizeof (REFERENCE)); r->label = 0; r->filename = filename ? xstrdup (filename) : 0; r->nodename = nodename ? xstrdup (nodename) : 0; r->start = 0; r->end = 0; r->line_number = 0; r->type = 0; return r; } /* Search for sequences of whitespace or newlines in STRING, replacing all such sequences with just a single space. Remove whitespace from start and end of string. */ void canonicalize_whitespace (char *string) { register int i, j; int len, whitespace_found, whitespace_loc = 0; char *temp; if (!string) return; len = strlen (string); temp = xmalloc (1 + len); /* Search for sequences of whitespace or newlines. Replace all such sequences in the string with just a single space. */ whitespace_found = 0; for (i = 0, j = 0; string[i]; i++) { if (whitespace_or_newline (string[i])) { whitespace_found++; whitespace_loc = i; continue; } else { if (whitespace_found && whitespace_loc) { whitespace_found = 0; /* Suppress whitespace at start of string. */ if (j) temp[j++] = ' '; } temp[j++] = string[i]; } } /* Kill trailing whitespace. */ if (j && whitespace (temp[j - 1])) j--; temp[j] = '\0'; strcpy (string, temp); free (temp); } /* **************************************************************** */ /* */ /* Scanning node */ /* */ /* **************************************************************** */ /* Whether to strip syntax from the text of nodes. */ int preprocess_nodes_p; /* Whether contents of nodes should be rewritten. */ static int rewrite_p; /* inptr is moved forward through the body of a node. */ static char *inptr; /* Pointer to first byte of node (after node separator). */ static char *input_start; /* Number of bytes in node contents. */ static size_t input_length; struct text_buffer output_buf; /* Pointer into a tags table for the file to the anchor we need to adjust as a result of byte counts changing due to character encoding conversion or inserted/deleted text. 
*/
static TAG **anchor_to_adjust;

/* Offset within file buffer of first byte of node, used for anchor
   adjustment. */
static int node_offset;

/* Difference so far between the number of bytes input in the file and
   bytes output.  Used to adjust the values of anchors in nodes. */
static long int output_bytes_difference;

/* Whether we are converting the character encoding of the file. */
static int convert_encoding_p;

#if HAVE_ICONV

/* Whether text in file is encoded in UTF-8. */
static int file_is_in_utf8;

/* Used for conversion from file encoding to output encoding. */
static iconv_t iconv_to_output;

/* Conversion from file encoding to UTF-8. */
static iconv_t iconv_to_utf8;

#endif /* HAVE_ICONV */

/* Decide whether the contents of FB need their character encoding
   converted, and if so open the iconv descriptors used to do it.

   convert_encoding_p is always cleared first.  It is set again (together
   with rewrite_p) only when FB is non-null, FB->encoding is known and
   differs from the locale's codeset, and all the required iconv
   conversions could be opened.  Without HAVE_ICONV no conversion is ever
   set up. */
void
init_conversion (FILE_BUFFER *fb)
{
  char *target_encoding;

  convert_encoding_p = 0;

  /* Node being processed does not come from an Info file. */
  if (!fb)
    return;

#if !HAVE_ICONV
  return;
#else
  file_is_in_utf8 = 0;

  /* Don't process file if encoding is unknown. */
  if (!fb->encoding)
    return;

  /* Read name of character encoding from environment locale */
  target_encoding = nl_langinfo (CODESET);

  /* Don't convert the contents if the locale
     uses the same character encoding as the file */
  if (!strcasecmp(target_encoding, fb->encoding))
    return;

  /* Check if an iconv conversion from file locale to system
     locale exists */
  iconv_to_output = iconv_open (target_encoding, fb->encoding);
  if (iconv_to_output == (iconv_t) -1)
    return; /* Return if no conversion function implemented */

  if (   !strcasecmp ("UTF8",  fb->encoding)
      || !strcasecmp ("UTF-8", fb->encoding))
    file_is_in_utf8 = 1;

  /* A second converter, to UTF-8, is only needed when the file is not
     already in UTF-8. */
  if (!file_is_in_utf8)
    {
      iconv_to_utf8 = iconv_open ("UTF-8", fb->encoding);
      if (iconv_to_utf8 == (iconv_t) -1)
        {
          /* Return if no conversion function implemented */
          /* Release the first descriptor so it is not left open. */
          iconv_close (iconv_to_output);
          iconv_to_output = (iconv_t) -1;
          return;
        }
    }

  convert_encoding_p = 1;
  rewrite_p = 1;
#endif /* HAVE_ICONV */
}

/* Close the iconv descriptors opened by init_conversion, if conversion
   was enabled.  iconv_to_output is reset to (iconv_t) -1 afterwards;
   iconv_to_utf8 is closed only when the file was not in UTF-8. */
void
close_conversion (void)
{
#if HAVE_ICONV
  if (convert_encoding_p)
    {
      iconv_close (iconv_to_output);
      iconv_to_output = (iconv_t) -1;
      if (!file_is_in_utf8)
        iconv_close (iconv_to_utf8);
    }
#endif
}

/* Prepare to produce output for a node from FB: set up encoding
   conversion, zero output_bytes_difference, and initialize output_buf
   if the node is going to be rewritten. */
static void
init_output_stream (FILE_BUFFER *fb)
{
  init_conversion (fb);
  output_bytes_difference = 0;

  /* init_conversion may have just turned rewrite_p on. */
  if (rewrite_p)
    text_buffer_init (&output_buf);
}

/* State recorded by save_conversion_state for reset_conversion. */
static size_t saved_offset;
static char *saved_inptr;
static long saved_difference;

/* Remember the current output offset, input position and byte
   difference so that reset_conversion can return to them. */
void
save_conversion_state (void)
{
  saved_offset = text_buffer_off (&output_buf);
  saved_inptr = inptr;
  saved_difference = output_bytes_difference;
}

/* Go back to the saved state of the output stream. */
void
reset_conversion (void)
{
  text_buffer_off (&output_buf) = saved_offset;
  inptr = saved_inptr;
  output_bytes_difference = saved_difference;
}

/* Copy bytes from input to output with no encoding conversion. */
static void
copy_direct (long n)
{
  text_buffer_add_string (&output_buf, inptr, n);
  inptr += n;
}

/* Read one character at *FROM and write out a sequence of bytes
   representing that character in ASCII.  *FROM is advanced past the
   read character.  *FROM_LEFT is the number of bytes available at *FROM
   and is decreased by the number of bytes consumed.  Return 1 if a
   replacement from the table was written, 0 if a "?" was written
   instead. */
static int
degrade_utf8 (char **from, size_t *from_left)
{
  /* Table mapping UTF-8 byte sequences to ASCII replacement strings,
     terminated by an entry of null pointers. */
  static struct encoding_replacement
  {
    char *from_string;
    char *to_string;
  } er[] = {
    {"\xE2\x80\x98","'"}, /* Opening single quote */
    {"\xE2\x80\x99","'"}, /* Closing single quote */
    {"\xE2\x80\x9C","\""},/* Opening double quote */
    {"\xE2\x80\x9D","\""},/* Closing double quote */
    {"\xC2\xA9","(C)"}, /* Copyright symbol */
    {"\xC2\xBB",">>"}, /* Closing double angle brackets */

    {"\xE2\x86\x92","->"},/* Right arrow */
    {"\xE2\x87\x92","=>"},/* Right double arrow */
    {"\xE2\x8A\xA3","-|"},/* Print symbol */
    {"\xE2\x98\x85","-!-"}, /* Point symbol */
    {"\xE2\x86\xA6","==>"}, /* Expansion symbol */

    {"\xE2\x80\x90","-"}, /* Hyphen */
    {"\xE2\x80\x91","-"}, /* Non-breaking hyphen */
    {"\xE2\x80\x92","-"}, /* Figure dash */
    {"\xE2\x80\x93","-"}, /* En dash */
    {"\xE2\x80\x94","--"}, /* Em dash */
    {"\xE2\x88\x92","-"}, /* Minus sign */
    {"\xE2\x80\xA6","..."}, /* Ellipsis */
    {"\xE2\x80\xA2","*"}, /* Bullet */

    {"\xC3\xA0","a`"}, /* Lower case letter a with grave accent */
    {"\xC3\xA2","a^"}, /* Lower case letter a with circumflex */
    {"\xC3\xA4","a\""}, /* Lower case letter a with diaeresis */
    {"\xC3\xA6","ae"}, /* Lower case letter ae ligature */
    {"\xC3\xA9","e'"}, /* Lower case letter e with acute accent */
    {"\xC3\xA8","e`"}, /* Lower case letter e with grave accent */
    {"\xC3\xAA","e^"}, /* Lower case letter e with circumflex */
    {"\xC3\xAB","e\""}, /* Lower case letter e with diaeresis */
    {"\xC3\xB6","o\""}, /* Lower case letter o with diaeresis */
    {"\xC3\xBC","u\""}, /* Lower case letter u with diaeresis */
    {"\xC3\x84", "A\""}, /* Upper case letter A with diaeresis. */
    {"\xC3\x96", "O\""}, /* Upper case letter O with diaeresis. */
    {"\xC3\x9c", "U\""}, /* Upper case letter U with diaeresis. */

    {"\xC3\xB1","n~"}, /* Lower case letter n with tilde */
    {"\xC3\x87","C,"}, /* Upper case letter C with cedilla */
    {"\xC3\xA7","c,"}, /* Lower case letter c with cedilla */
    {"\xC3\x9f","ss"}, /* Lower case letter sharp s */

    {0, 0}
  };

  struct encoding_replacement *erp;

  /* Linear search of the table for a sequence matching the input. */
  for (erp = er; erp->from_string != 0; erp++)
    {
      /* Avoid reading past end of input. */
      int width = strlen (erp->from_string);
      if (width > *from_left)
        continue;

      if (!strncmp (erp->from_string, *from, width))
        {
          text_buffer_add_string (&output_buf, erp->to_string,
                                  strlen(erp->to_string));
          *from += width;
          *from_left -= width;
          return 1;
        }
    }

  /* Failing this, just print a question mark.  Maybe we should use SUB
     (^Z) (ASCII substitute character code) instead, or pass through the
     original bytes. */
  text_buffer_add_string (&output_buf, "?", 1);

  /* Ideally we would advance one UTF-8 character.  This would require
     knowing its length in bytes. */
  /* As it is, only a single byte is consumed here. */
  (*from)++;
  (*from_left)--;

  return 0;
}

/* Convert N bytes from input to output encoding and write to output
   buffer.  Return number of bytes over N written.
*/ static int copy_converting (long n) { #if !HAVE_ICONV return 0; #else size_t bytes_left, orig_bytes_left; int extra_at_end; size_t iconv_ret; long output_start; size_t utf8_char_free; char utf8_char[4]; /* Maximum 4 bytes in a UTF-8 character */ char *utf8_char_ptr, *orig_inptr; size_t i; /* Use n as an estimate of how many bytes will be required in target encoding. */ text_buffer_alloc (&output_buf, (size_t) n); output_start = text_buffer_off (&output_buf); bytes_left = n; extra_at_end = 0; while (1) { iconv_ret = text_buffer_iconv (&output_buf, iconv_to_output, (ICONV_CONST char **)&inptr, &bytes_left); /* Make sure libiconv flushes out the last converted character. This is required when the conversion is stateful, in which case libiconv might not output the last character, waiting to see whether it should be combined with the next one. */ if (iconv_ret != (size_t) -1 && text_buffer_iconv (&output_buf, iconv_to_output, NULL, NULL) != (size_t) -1) /* Success: all of input converted. */ break; /* There's been an error while converting. */ switch (errno) { case EINVAL: /* Incomplete byte sequence at end of input buffer. Try to read more. */ /* input_length - 2 is offset of last-but-one byte within input. This checks if there is at least one more byte within node contents. */ if (inptr - input_start + (bytes_left - 1) <= input_length - 2) { bytes_left++; extra_at_end++; } else { copy_direct (bytes_left); bytes_left = 0; } continue; default: /* Unknown error */ info_error (_("Error converting file character encoding")); /* Skip past current input and hope we don't get an error next time. */ inptr += bytes_left; return 0; case EILSEQ: /* Byte sequence in input not recognized. Degrade to ASCII. */ break; } /* Flush any waiting input in iconv_to_output and enter the default shift state. 
*/ text_buffer_iconv (&output_buf, iconv_to_output, NULL, NULL); if (file_is_in_utf8) { degrade_utf8 (&inptr, &bytes_left); continue; } /* If file is not in UTF-8, we degrade to ASCII in two steps: first convert the character to UTF-8, then look up a replacement string. Note that mixing iconv_to_output and iconv_to_utf8 on the same input may not work well if the input encoding is stateful. We could deal with this by always converting to UTF-8 first; then we could mix conversions on the UTF-8 stream. */ /* We want to read exactly one character. Do this by restricting size of output buffer. */ utf8_char_ptr = utf8_char; orig_inptr = inptr; orig_bytes_left = bytes_left; for (i = 1; i <= 4; i++) { utf8_char_free = i; errno = 0; iconv_ret = iconv (iconv_to_utf8, (ICONV_CONST char **)&inptr, &bytes_left, &utf8_char_ptr, &utf8_char_free); if ((iconv_ret == (size_t) -1 && errno != E2BIG) /* If we managed to convert a character: */ || utf8_char_ptr > utf8_char) break; } /* errno == E2BIG if iconv ran out of output buffer, which is expected. */ if (iconv_ret == (size_t) -1 && errno != E2BIG) { /* Character is not recognized. Copy a single byte. */ inptr = orig_inptr; /* iconv might have incremented inptr */ copy_direct (1); bytes_left = orig_bytes_left - 1; } else { utf8_char_ptr = utf8_char; /* i is width of UTF-8 character */ degrade_utf8 (&utf8_char_ptr, &i); /* If we are done, make sure iconv flushes the last character. */ if (bytes_left <= 0) { utf8_char_ptr = utf8_char; i = 4; iconv (iconv_to_utf8, NULL, NULL, &utf8_char_ptr, &utf8_char_free); if (utf8_char_ptr > utf8_char) { utf8_char_ptr = utf8_char; degrade_utf8 (&utf8_char_ptr, &i); } } } } /* Must cast because the difference between unsigned size_t is always positive. */ output_bytes_difference += n - ((signed long) text_buffer_off (&output_buf) - output_start); return extra_at_end; #endif /* HAVE_ICONV */ } /* Functions below are named from the perspective of the preprocess_nodes_p flag being on. 
*/ /* Copy text from input node contents, possibly converting the character encoding and adjusting anchor offsets at the same time. */ static void copy_input_to_output (long n) { if (rewrite_p) { long bytes_left; bytes_left = n; while (bytes_left > 0) { if (!convert_encoding_p) { copy_direct (bytes_left); bytes_left = 0; } else { long bytes_to_convert; long extra_written; bytes_to_convert = bytes_left; if (anchor_to_adjust) { /* Check there is an anchor in the input. */ long first_anchor = (*anchor_to_adjust)->nodestart - node_offset; if (first_anchor < 0) anchor_to_adjust = 0; /* error in input file */ else if (first_anchor < (inptr-input_start) + bytes_left) { /* Convert enough to pass the first anchor in input. */ bytes_to_convert = first_anchor - (inptr-input_start)+1; if (bytes_to_convert < 0) { bytes_to_convert = bytes_left; anchor_to_adjust = 0; } } } /* copy_converting may read more than bytes_to_convert bytes if its input ends in an incomplete byte sequence. */ extra_written = copy_converting (bytes_to_convert); bytes_left -= bytes_to_convert + extra_written; } /* Check if we have gone past any anchors and adjust with output_bytes_difference. */ if (anchor_to_adjust) while ((*anchor_to_adjust)->nodestart - node_offset <= inptr - input_start) { (*anchor_to_adjust)->nodestart_adjusted = (*anchor_to_adjust)->nodestart - output_bytes_difference; anchor_to_adjust++; if (!*anchor_to_adjust || (*anchor_to_adjust)->cache.nodelen != 0) { anchor_to_adjust = 0; break; } } } } else inptr += n; } static void skip_input (long n) { if (preprocess_nodes_p) { inptr += n; output_bytes_difference += n; } else if (rewrite_p) { /* We are expanding tags only. Do not skip input. 
*/ copy_input_to_output (n); } else { inptr += n; } } static void write_extra_bytes_to_output (char *input, long n) { if (preprocess_nodes_p) { text_buffer_add_string (&output_buf, input, n); output_bytes_difference -= n; } } /* Like write_extra_bytes_to_output, but writes bytes even when preprocess_nodes=Off. */ static void write_tag_contents (char *input, long n) { if (rewrite_p) { text_buffer_add_string (&output_buf, input, n); output_bytes_difference -= n; } } /* Like skip_input, but skip even when !preprocess_nodes_p. */ static void skip_tag_contents (long n) { if (rewrite_p) { inptr += n; output_bytes_difference += n; } } /* Read first line of node and set next, prev and up. */ static void parse_top_node_line (NODE *node) { char **store_in = 0; char *nodename; char *ptr; int value_length; /* If the first line is empty, leave it in. This is the case in the index-apropos window. */ if (*node->contents == '\n') return; node->next = node->prev = node->up = 0; ptr = node->contents; while (1) { store_in = 0; ptr += skip_whitespace (ptr); /* Check what field we are looking at */ if (!strncasecmp (ptr, INFO_FILE_LABEL, strlen(INFO_FILE_LABEL))) { ptr += strlen (INFO_FILE_LABEL); } else if (!strncasecmp (ptr, INFO_NODE_LABEL, strlen(INFO_NODE_LABEL))) { ptr += strlen (INFO_NODE_LABEL); } else if (!strncasecmp (ptr, INFO_PREV_LABEL, strlen(INFO_PREV_LABEL))) { ptr += strlen (INFO_PREV_LABEL); store_in = &node->prev; } else if (!strncasecmp (ptr, INFO_ALTPREV_LABEL, strlen(INFO_ALTPREV_LABEL))) { ptr += strlen (INFO_ALTPREV_LABEL); store_in = &node->prev; } else if (!strncasecmp (ptr, INFO_NEXT_LABEL, strlen(INFO_NEXT_LABEL))) { ptr += strlen (INFO_NEXT_LABEL); store_in = &node->next; } else if (!strncasecmp (ptr, INFO_UP_LABEL, strlen(INFO_UP_LABEL))) { ptr += strlen (INFO_UP_LABEL); store_in = &node->up; } else { store_in = 0; /* Not recognized - code below will skip to next comma */ } ptr += skip_whitespace (ptr); /* Get length of a bracketed filename component. 
*/ if (*ptr != '(') value_length = 0; else value_length = read_bracketed_filename (ptr, 0); /* Get length of node name, or filename if following "File:". Note that . is not included in the second argument here in order to support this character in file names. */ value_length += read_quoted_string (ptr + value_length, "\n\r\t,", 1, &nodename); if (store_in) { *store_in = xmalloc (value_length + 1); strncpy (*store_in, ptr, value_length); (*store_in)[value_length] = '\0'; } free (nodename); ptr += value_length; if (*ptr == '\n' || !*ptr) break; ptr += 1; /* Point after field terminator */ } } /* Output, replace or hide text introducing a reference. INPTR starts on the first byte of a sequence introducing a reference and finishes on the first (non-whitespace) byte of the reference label. */ static int scan_reference_marker (REFERENCE *entry, int in_parentheses) { /* When preprocess_nodes is Off, we position the cursor on the "*" when moving between references. */ if (!preprocess_nodes_p) { if (rewrite_p) entry->start = text_buffer_off(&output_buf); else entry->start = inptr - input_start; } /* Check what we found based on first character of match */ if (inptr[0] == '\n') { entry->type = REFERENCE_MENU_ITEM; if (!preprocess_nodes_p) entry->start++; } else entry->type = REFERENCE_XREF; if (entry->type == REFERENCE_MENU_ITEM) copy_input_to_output (strlen ("\n* ")); else { /* Only match "*Note" if it is followed by a whitespace character so that it will not be recognized if, e.g., it is surrounded in inverted commas. */ if (!strchr (" \t\r\n", inptr[strlen ("*Note")])) { copy_input_to_output (strlen ("*Note:")); return 0; } /* Cross-references can be generated by four different Texinfo commands. @inforef and @xref output "*Note " in Info format, and "See" in HTML and print. @ref and @pxref output "*note " in Info format, and either nothing at all or "see" in HTML and print. Unfortunately, there is no easy way to distinguish between these latter two cases. 
*/ /* TODO: Internationalize these strings, but only if we know the language of the document. */ if (inptr[1] == 'N') { write_extra_bytes_to_output ("See", 3); in_parentheses = 1; } else if (in_parentheses) { write_extra_bytes_to_output ("see", 3); /* Only output the "see" for input like "(*note ...)", which would have come from a use of @pxref. We used to output "see" for "*note" in more circumstances, with a list of words where to suppress it (to avoid "see *note" turning into "see see"), but such a list can't be complete or reliable. It's better to remove it with more enthusiasm, then if the document writer wants a "see" to appear, they can add one themselves. */ } skip_input (strlen ("*Note")); if (!in_parentheses) skip_input (skip_whitespace (inptr)); } /* Copy any white space before label. */ copy_input_to_output (skip_whitespace_and_newlines (inptr)); return 1; } /* Output reference label and update ENTRY. INPTR should be on the first non-whitespace byte of label when this function is called. It is left at the first character after the colon terminating the label. Return 0 if invalid syntax is encountered. */ static int scan_reference_label (REFERENCE *entry, int in_index) { int max_lines; int len, label_len = 0; /* Handle case of cross-reference like (FILE)NODE::. */ if (inptr[0] == '(' && !in_index) label_len = read_bracketed_filename (inptr, &entry->filename); /* Search forward to ":" to get label name. Cross-references may have a newline in the middle. */ if (entry->type == REFERENCE_MENU_ITEM) max_lines = 1; else max_lines = 2; if (!in_index || inptr[label_len] == '\177') { len = read_quoted_string (inptr + label_len, ":", max_lines, &entry->nodename); canonicalize_whitespace (entry->nodename); if (!len) return 0; /* Input invalid. */ label_len += len; } else { /* If in an index node, go forward to the last colon on the line (not preceded by a newline, NUL or DEL). This is in order to support index entries containing colons. 
This should work fine as long as the node name does not contain a colon as well. */ char *p; int n, m = 0; p = inptr + label_len; while (1) { n = strcspn (p, ":\n\177"); if (p[n] == ':') { m += n + 1; p += n + 1; continue; } break; } if (m == 0) return 0; /* no : found */ label_len += m - 1; } #if HAVE_ICONV if (iconv_to_output != (iconv_t) -1 && iconv_to_output != (iconv_t) 0) { static struct text_buffer label_text; size_t iconv_ret; size_t inbytesleft = label_len; char *p = inptr; text_buffer_reset (&label_text); text_buffer_alloc (&label_text, label_len); while (1) { iconv_ret = text_buffer_iconv (&label_text, iconv_to_output, (ICONV_CONST char **)&p, &inbytesleft); /* Make sure libiconv flushes out the last converted character. */ if (iconv_ret != (size_t) -1 && text_buffer_iconv (&label_text, iconv_to_output, NULL, NULL) != (size_t) -1) break; /* Success: all of input converted. */ /* There's been an error while converting. */ goto no_convert; } text_buffer_add_char (&label_text, '\0'); entry->label = strdup (label_text.base); } else #endif { no_convert: entry->label = xmalloc (label_len + 1); memcpy (entry->label, inptr, label_len); entry->label[label_len] = '\0'; } canonicalize_whitespace (entry->label); if (preprocess_nodes_p) entry->start = text_buffer_off (&output_buf); /* Write text of label. */ copy_input_to_output (label_len); if (rewrite_p) entry->end = text_buffer_off (&output_buf); else entry->end = inptr - input_start; /* Colon after label. */ if (*inptr) skip_input (1); /* Don't mess up the margin of a menu description. */ if (entry->type == REFERENCE_MENU_ITEM) write_extra_bytes_to_output (" ", 1); return 1; } /* INPTR should be at the first character after the colon terminating the label. Return 0 on syntax error. */ static int scan_reference_target (REFERENCE *entry, NODE *node, int in_parentheses) { int i; /* This entry continues with a specific target. Parse the file name and node name from the specification. 
*/ if (entry->type == REFERENCE_XREF) { int length = 0; /* Length of specification */ char *target_start = inptr; char *nl_off = 0; int space_at_start_of_line = 0; length += skip_whitespace_and_newlines (inptr); length += read_bracketed_filename (inptr + length, &entry->filename); length += skip_whitespace_and_newlines (inptr + length); /* Get the node name. */ length += read_quoted_string (inptr + length, ",.", 2, &entry->nodename); skip_input (length); /* Check if there is a newline in the target. */ nl_off = strchr (target_start, '\n'); if (nl_off) { if (nl_off < inptr) space_at_start_of_line = skip_whitespace (nl_off + 1); else nl_off = 0; } canonicalize_whitespace (entry->nodename); if (entry->filename) { /* Heuristic of whether it's worth outputing a newline before the filename. This checks whether the newline appears more than half way through the text, and therefore which side is longer. */ if (nl_off && nl_off < target_start + (length - space_at_start_of_line) / 2) { int i; write_extra_bytes_to_output ("\n", 1); for (i = 0; i < space_at_start_of_line; i++) write_extra_bytes_to_output (" ", 1); skip_input (strspn (inptr, " ")); nl_off = 0; } else if (*inptr != '\n') { write_extra_bytes_to_output (" ", 1); } write_extra_bytes_to_output ("(", 1); write_extra_bytes_to_output (entry->filename, strlen (entry->filename)); write_extra_bytes_to_output (" manual)", strlen (" manual)")); } /* Hide terminating punctuation if we are in a reference like "(*note Label:(file)node.)". */ if (in_parentheses && inptr[0] == '.') skip_input (1); /* Copy any terminating punctuation before the optional newline. */ copy_input_to_output (strspn (inptr, ".),")); /* Output a newline if one is needed. Don't do it at the end of a paragraph. 
*/ if (nl_off && *inptr != '\n') { int i; write_extra_bytes_to_output ("\n", 1); for (i = 0; i < space_at_start_of_line; i++) write_extra_bytes_to_output (" ", 1); skip_input (strspn (inptr, " ")); } } else /* entry->type == REFERENCE_MENU_ITEM */ { int line_len; int length = 0; /* Length of specification */ length = skip_whitespace (inptr); length += read_bracketed_filename (inptr + length, &entry->filename); length += strspn (inptr + length, " "); /* Get the node name. */ length += read_quoted_string (inptr + length, ",.\t\n", 2, &entry->nodename); if (inptr[length] == '.') /* A '.' terminating the entry. */ length++; canonicalize_whitespace (entry->nodename); if (node->flags & N_IsDir) { /* Set line_len to length of line so far. */ char *linestart; linestart = memrchr (input_start, '\n', inptr - input_start); if (!linestart) linestart = input_start; else linestart++; /* Point to first character after newline. */ line_len = inptr - linestart; } if (node->flags & N_IsIndex) /* Show the name of the node the index entry refers to. */ copy_input_to_output (length); else { skip_input (length); if ((node->flags & N_IsDir) && inptr[strspn (inptr, " ")] == '\n') { /* For a dir node, if there is no more text in this line, check if there is a menu entry description in the next line to the right of the end of the label, and display it in this line. */ skip_input (strspn (inptr, " ")); if (line_len <= strspn (inptr + 1, " ")) skip_input (1 + line_len); } else { for (i = 0; i < length; i++) write_extra_bytes_to_output (" ", 1); } } /* Parse "(line ...)" part of menus, if any. */ { char *lineptr = inptr; /* Skip any whitespace first, and then a newline in case the item was so long to contain the ``(line ...)'' string in the same physical line. 
*/ lineptr += skip_whitespace (inptr); if (*lineptr == '\n') lineptr += 1 + skip_whitespace (lineptr + 1); if (!strncmp (lineptr, "(line ", strlen ("(line "))) { lineptr += strlen ("(line "); entry->line_number = strtol (lineptr, 0, 0); } else entry->line_number = 0; } } return 1; } /* BASE is earlier in a block of allocated memory than PTR, and the block extends until at least BASE + LEN - 1. Return PTR[INDEX], unless this could be outside the allocated block, in which case return 0. */ static char safe_string_index (char *ptr, long index, char *base, long len) { long offset = ptr - base; if ( offset + index < 0 || offset + index >= len) return 0; return ptr[index]; } /* Process an in index marker ("^@^H[index^@^H]") or an image marker ("^@^H[image ...^@^H]"). */ static void scan_info_tag (NODE *node, int *in_index, FILE_BUFFER *fb) { char *p, *p1; struct text_buffer *expansion = xmalloc (sizeof (struct text_buffer)); p = inptr; p1 = p; text_buffer_init (expansion); if (tag_expand (&p1, input_start + input_length, expansion, in_index)) { if (*in_index) node->flags |= N_IsIndex; if (!rewrite_p) { rewrite_p = 1; init_output_stream (fb); /* Put inptr back to start so that copy_input_to_output below gets all preceding contents. */ inptr = node->contents; } /* Write out up to tag. */ copy_input_to_output (p - inptr); write_tag_contents (text_buffer_base (expansion), text_buffer_off (expansion)); /* Skip past body of tag. */ skip_tag_contents (p1 - inptr); } else { /* It was not a valid tag. */ copy_input_to_output (p - inptr + 1); } text_buffer_free (expansion); free (expansion); } #define looking_at_string(contents, string) \ (!strncasecmp (contents, string, strlen (string))) static char * forward_to_info_syntax (char *contents) { /* Loop until just before the end of the input. The '- 3' prevents us accessing memory after the end of the input, and none of the strings we are looking for are shorter than 3 bytes. 
*/ while (contents < input_start + input_length - 3) { /* Menu entry comes first to optimize for the case of looking through a long index node. */ if (looking_at_string (contents, INFO_MENU_ENTRY_LABEL) || looking_at_string (contents, INFO_XREF_LABEL) || !memcmp (contents, "\0\b[", 3)) return contents; contents++; } return 0; } /* Scan contents of NODE, recording cross-references and similar. Convert character encoding of node contents to that of the user if the two are known to be different. If PREPROCESS_NODES_P == 1, remove Info syntax in contents. If FB is non-null, it is the file containing the node, and TAG_PTR is an offset into FB->tags. If the node contents are rewritten, adjust anchors that occur in the node and store adjusted value as TAG->nodestart_adjusted, otherwise simply copy TAG->nodestart to TAG->nodestart_adjusted for each anchor in the node. */ void scan_node_contents (NODE *node, FILE_BUFFER *fb, TAG **tag_ptr) { int in_menu = 0; char *match; REFERENCE **refs = NULL; size_t refs_index = 0, refs_slots = 0; /* Whether an index tag was seen. */ int in_index = 0; rewrite_p = preprocess_nodes_p; init_output_stream (fb); if (fb) { char *file_contents; /* Set anchor_to_adjust to first anchor in node, if any. */ anchor_to_adjust = tag_ptr + 1; if (!*anchor_to_adjust) anchor_to_adjust = 0; else if (*anchor_to_adjust && (*anchor_to_adjust)->cache.nodelen != 0) anchor_to_adjust = 0; if (!node->subfile) file_contents = fb->contents; else { FILE_BUFFER *f = info_find_subfile (node->subfile); if (!f) return; /* This shouldn't happen. */ file_contents = f->contents; } node_offset = (*tag_ptr)->nodestart + skip_node_separator (file_contents + (*tag_ptr)->nodestart); } else anchor_to_adjust = 0; /* Initialize refs to point to array of one null pointer in case there are no results. This way we know if refs has been initialized even if it is empty. 
*/ refs = calloc (1, sizeof *refs); refs_slots = 1; parse_top_node_line (node); /* This should be the only time we assign to inptr in this function - all other assignment should be done with the helper functions above. */ inptr = node->contents; input_start = node->contents; input_length = node->nodelen; while ((match = forward_to_info_syntax (inptr)) && match < node->contents + node->nodelen) { int in_parentheses = 0; REFERENCE *entry; /* Write out up to match */ copy_input_to_output (match - inptr); if ((in_menu && match[0] == '\n') || match[0] == '*') { /* Menu entry or cross reference. */ /* Create REFERENCE entity. */ entry = info_new_reference (0, 0); if (safe_string_index (inptr, -1, input_start, input_length) == '(' && safe_string_index (inptr, 1, input_start, input_length) == 'n') in_parentheses = 1; save_conversion_state (); if (!scan_reference_marker (entry, in_parentheses)) goto not_a_reference; if (!scan_reference_label (entry, in_index)) goto not_a_reference; /* If this reference entry continues with another ':' then the target of the reference is given by the label. */ if (*inptr == ':') { int label_len; skip_input (1); if (entry->type == REFERENCE_MENU_ITEM) write_extra_bytes_to_output (" ", 1); /* Remove the DEL bytes from a label like "(FOO)^?BAR^?::". */ label_len = strlen (entry->label); if (label_len >= 2 && entry->label[label_len - 1] == 0177) { char *p = strchr (entry->label, '\177'); memmove (p, p + 1, label_len - (p - entry->label) - 1); entry->label[label_len - 2] = '\0'; } } else { /* Proceed to read the rest of the reference. */ /* TODO: we should probably not allow references of the form "(file)node1:node2." or "(file1)node1:(file2)node2", so bail out here if entry->filename is non-null. 
*/ free (entry->filename); entry->filename = 0; free (entry->nodename); entry->nodename = 0; if (!scan_reference_target (entry, node, in_parentheses)) goto not_a_reference; } if (0) { char *cur_inptr; not_a_reference: /* This is not a menu entry or reference. Do not add to our list. */ cur_inptr = inptr; reset_conversion (); copy_input_to_output (cur_inptr - inptr); info_reference_free (entry); continue; } add_pointer_to_array (entry, refs_index, refs, refs_slots, 50); } /* Was "* Menu:" seen? If so, search for menu entries hereafter. */ else if (!in_menu && !strncmp (match, INFO_MENU_LABEL, strlen (INFO_MENU_LABEL))) { in_menu = 1; skip_input (strlen ("\n* Menu:")); if (*inptr == '\n') skip_input (strspn (inptr, "\n") - 1); /* Keep one newline. */ } else if (match[0] == '\0') /* Info tag */ { scan_info_tag (node, &in_index, fb); } else copy_input_to_output (1); } /* If we haven't accidentally gone past the end of the node, write out the rest of it. */ if (inptr < node->contents + node->nodelen) copy_input_to_output ((node->contents + node->nodelen) - inptr); /* Null to terminate buffer. */ if (rewrite_p) text_buffer_add_string (&output_buf, "\0", 1); /* Free resources used in character encoding conversion. */ close_conversion (); node->references = refs; if (rewrite_p) { if (node->flags & N_WasRewritten) free (node->contents); node->contents = text_buffer_base (&output_buf); node->flags |= N_WasRewritten; /* output_buf.off is the offset of the next character to be written. Subtracting 1 gives the offset of our terminating null, that is, the length. */ node->nodelen = text_buffer_off (&output_buf) - 1; } else if (fb && tag_ptr) { /* Set nodestart_adjusted for all of the anchors in this node. */ tag_ptr++; while (*tag_ptr && (*tag_ptr)->cache.nodelen == 0) { (*tag_ptr)->nodestart_adjusted = (*tag_ptr)->nodestart - output_bytes_difference; tag_ptr++; } } }