/* scan.c -- scanning Info files and nodes

   Copyright 1993-2023 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   Originally written by Brian Fox. */

#include "info.h"
#include "session.h"
#include "scan.h"
#include "util.h"
#include "tag.h"

#include <langinfo.h>
#if HAVE_ICONV
# include <iconv.h>
#endif
#include <wchar.h>
#ifdef __MINGW32__
/* MinGW uses a replacement nl_langinfo, see pcterm.c.  */
# define nl_langinfo rpl_nl_langinfo
extern char * rpl_nl_langinfo (nl_item);
/* MinGW uses its own replacement wcwidth, see pcterm.c for the
   reasons.  Since Gnulib's wchar.h might redirect wcwidth to
   rpl_wcwidth, we explicitly undo that here.  */
#undef wcwidth
#endif

#ifdef __hpux
#define va_copy(ap1,ap2) memcpy((&ap1),(&ap2),sizeof(va_list))
#endif

/* File name most recently parsed by info_parse_node (), or a null pointer
   if the parsed string had no "(FILE)" part.  The storage belongs to this
   file: info_parse_node () frees it at the start of every call. */
char *info_parsed_filename = NULL;

/* Node name most recently parsed by info_parse_node ().  Like
   INFO_PARSED_FILENAME, it is freed and reset at the start of every call
   to info_parse_node (), so callers must not free it or keep it across
   calls. */
char *info_parsed_nodename = NULL;

/* Read a file name enclosed in "(" and ")" at the start of STRING.  Nested
   brackets inside the name are allowed as long as they balance.  If the
   brackets never balance, the first ")" seen ends the name instead.

   If FILENAME is not null, *FILENAME receives a newly allocated copy of
   the name, without the enclosing brackets.

   Return the number of bytes occupied by the bracketed name, including
   both brackets.  Return 0 if STRING does not start with "(" (*FILENAME
   is then left alone), or if there is no ")" at all (*FILENAME is then
   set to null). */
int
read_bracketed_filename (char *string, char **filename)
{
  char *body;           /* Text following the opening bracket. */
  int depth = 1;        /* Brackets currently open. */
  int first_close = -1; /* Offset in BODY of the first ")" seen. */
  int len;

  if (*string != '(')
    return 0;

  body = string + 1;
  for (len = 0; body[len] != '\0'; len++)
    {
      if (body[len] == '(')
        depth++;
      else if (body[len] == ')')
        {
          if (first_close < 0)
            first_close = len;

          if (--depth == 0)
            break;
        }
    }

  if (depth > 0)
    {
      /* Ran off the end of the string with brackets still open. */
      if (first_close < 0)
        {
          if (filename)
            *filename = 0;
          return 0;
        }

      /* Fall back on the first closing bracket. */
      len = first_close;
    }

  if (filename)
    {
      /* xcalloc zero-fills, which supplies the terminating null. */
      *filename = xcalloc (1, len + 1);
      memcpy (*filename, body, len);
    }

  return len + 2; /* Name plus the two brackets. */
}

/* Split STRING, of the form "(FILE)NODE", into its parts, storing them in
   INFO_PARSED_FILENAME and INFO_PARSED_NODENAME.  Both variables are freed
   and cleared first, so calling code must not free them.

   If STRING is null or empty, both are left as null pointers.  If there is
   no bracketed file name, INFO_PARSED_FILENAME stays null.  Whatever
   follows the file name is read as the node name and, if it is not empty,
   has its whitespace canonicalized. */
void
info_parse_node (char *string)
{
  char *p;
  int nodename_len;

  /* Throw away the results of any previous call. */
  free (info_parsed_nodename);
  info_parsed_nodename = 0;
  free (info_parsed_filename);
  info_parsed_filename = 0;

  if (!string || *string == '\0')
    return;

  /* Optional "(FILE)" part. */
  p = string + skip_whitespace_and_newlines (string);
  p += read_bracketed_filename (p, &info_parsed_filename);

  /* The node name runs to the end of the string. */
  p += skip_whitespace_and_newlines (p);
  nodename_len = read_quoted_string (p, "", 0, &info_parsed_nodename);

  if (nodename_len != 0)
    canonicalize_whitespace (info_parsed_nodename);
}

/* Set *OUTPUT to a newly allocated copy of the string that begins at START
   and ends just before the first character found in TERMINATOR.  If
   START[0] is INFO_QUOTE (byte 0177), the string instead begins at START+1
   and ends just before the next occurrence of INFO_QUOTE.  If TERMINATOR
   is an empty string, the string ends at the null character.

   LINES is the maximum number of lines that the string may span; zero
   means no limit.  The limit is imposed by temporarily writing a null
   byte into the input, which is restored before returning.

   Return the length of the string read, counting any quoting characters.
   If TERMINATOR is not empty and the end of the input is reached first,
   the input is invalid: set *OUTPUT to null and return 0. */
long
read_quoted_string (char *start, char *terminator, int lines, char **output)
{
  char *cut = 0;        /* Where the input was temporarily truncated. */
  char cut_char = 0;    /* Byte that used to be at CUT. */
  char *body;           /* First byte of the string proper. */
  int quoted;
  long len;

  if (lines)
    {
      int seen;

      /* Find the byte following the LINES'th newline, if there is one. */
      cut = start;
      for (seen = 0; seen < lines; seen++)
        {
          cut = strchr (cut, '\n');
          if (!cut)
            break; /* Fewer than LINES lines remain. */
          cut++;
        }

      if (cut)
        {
          cut_char = *cut;
          *cut = '\0';
        }
    }

  quoted = (start[0] == '\177');
  body = quoted ? start + 1 : start;
  len = strcspn (body, quoted ? "\177" : terminator);

  if (*terminator && body[len] == '\0')
    {
      /* Hit the end of input with no terminator or closing 177 byte. */
      len = 0;
      *output = 0;
    }
  else
    {
      *output = xmalloc (len + 1);
      memcpy (*output, body, len);
      (*output)[len] = '\0';

      if (quoted)
        len += 2; /* Count the two 177 bytes. */
    }

  if (cut)
    *cut = cut_char;

  return len;
}


/* **************************************************************** */
/*                                                                  */
/*                  Finding and Building Menus                      */
/*                                                                  */
/* **************************************************************** */

/* Look up LABEL among the menu items in NODE's list of references.  A
   case-insensitive match of the whole label is returned as soon as it is
   found.  If there is none and SLOPPY is nonzero, return the first menu
   item whose label merely starts with LABEL (so "buffer" finds
   "Buffers").  Return null if nothing matches or NODE has no references.
   The result points into NODE's reference list and must not be freed. */
REFERENCE *
info_get_menu_entry_by_label (NODE *node, char *label, int sloppy)
{
  REFERENCE **refs = node->references;
  REFERENCE **p;
  REFERENCE *prefix_match = 0;

  if (!refs)
    return 0;

  for (p = refs; *p; p++)
    {
      REFERENCE *entry = *p;

      if (entry->type != REFERENCE_MENU_ITEM)
        continue;

      if (mbscasecmp (label, entry->label) == 0)
        return entry; /* Exact, case-insensitive match. */

      /* Remember only the first entry that LABEL is a prefix of. */
      if (sloppy && !prefix_match
          && mbsncasecmp (entry->label, label, strlen (label)) == 0)
        prefix_match = entry;
    }

  return prefix_match;
}

/* Return a newly allocated, null-terminated REFERENCE ** array holding the
   entries of REF1 followed by those of REF2.  Either argument may be
   null, which counts as an empty list.  Only the pointers are copied;
   the REFERENCE objects themselves are shared with the inputs. */
REFERENCE **
info_concatenate_references (REFERENCE **ref1, REFERENCE **ref2)
{
  REFERENCE **lists[2];
  REFERENCE **result, **out, **p;
  int total = 0;
  int k;

  lists[0] = ref1;
  lists[1] = ref2;

  /* Count the slots needed. */
  for (k = 0; k < 2; k++)
    if (lists[k])
      for (p = lists[k]; *p; p++)
        total++;

  result = xmalloc ((1 + total) * sizeof (REFERENCE *));

  /* Copy the pointers across, first list first. */
  out = result;
  for (k = 0; k < 2; k++)
    if (lists[k])
      for (p = lists[k]; *p; p++)
        *out++ = *p;

  *out = NULL;
  return result;
}

/* Return a newly allocated copy of SRC.  The label, filename and nodename
   strings are duplicated when present, so the copy shares no storage
   with SRC. */
REFERENCE *
info_copy_reference (REFERENCE *src)
{
  REFERENCE *copy = xmalloc (sizeof (REFERENCE));

  /* Plain values. */
  copy->type = src->type;
  copy->line_number = src->line_number;
  copy->start = src->start;
  copy->end = src->end;

  /* Strings, each duplicated unless absent. */
  copy->label = NULL;
  if (src->label)
    copy->label = xstrdup (src->label);

  copy->filename = NULL;
  if (src->filename)
    copy->filename = xstrdup (src->filename);

  copy->nodename = NULL;
  if (src->nodename)
    copy->nodename = xstrdup (src->nodename);

  return copy;
}

/* Return a newly allocated, null-terminated copy of the reference list
   REF1, copying each reference in turn with info_copy_reference.  Return
   null if REF1 is null. */
REFERENCE **
info_copy_references (REFERENCE **ref1)
{
  REFERENCE **copy;
  int count = 0;
  int k;

  if (!ref1)
    return 0;

  /* Count the entries to find out how many slots are needed. */
  while (ref1[count])
    count++;

  copy = xmalloc ((1 + count) * sizeof (REFERENCE *));

  for (k = 0; ref1[k]; k++)
    copy[k] = info_copy_reference (ref1[k]);
  copy[k] = NULL;

  return copy;
}

/* Free REF together with its label, filename and nodename strings.  Does
   nothing if REF is null. */
void
info_reference_free (REFERENCE *ref)
{
  if (!ref)
    return;

  free (ref->nodename);
  free (ref->filename);
  free (ref->label);
  free (ref);
}

/* Free the null-terminated list REFERENCES along with every reference in
   it.  Does nothing if REFERENCES is null. */
void
info_free_references (REFERENCE **references)
{
  REFERENCE **p;

  if (!references)
    return;

  for (p = references; *p; p++)
    info_reference_free (*p);

  free (references);
}

/* Return a newly allocated REFERENCE whose filename and nodename fields are
   copies of FILENAME and NODENAME (null if the argument is null).  The
   label is null and the start, end, line_number and type fields are
   zero. */
REFERENCE *
info_new_reference (char *filename, char *nodename)
{
  REFERENCE *ref = xmalloc (sizeof (REFERENCE));

  ref->type = 0;
  ref->line_number = 0;
  ref->start = 0;
  ref->end = 0;
  ref->label = 0;

  ref->filename = 0;
  if (filename)
    ref->filename = xstrdup (filename);

  ref->nodename = 0;
  if (nodename)
    ref->nodename = xstrdup (nodename);

  return ref;
}


/* Rewrite STRING in place so that every run of whitespace or newlines
   becomes a single space, and whitespace at the start and end of the
   string is removed.  Does nothing if STRING is null.

   The result is never longer than the input, so the string is compacted
   where it stands: the write position never gets ahead of the read
   position. */
void
canonicalize_whitespace (char *string)
{
  size_t in;            /* Next byte to read. */
  size_t out = 0;       /* Next byte to write; always <= IN. */
  int pending_space = 0;/* Whitespace seen since the last byte written. */

  if (!string)
    return;

  for (in = 0; string[in]; in++)
    {
      if (whitespace_or_newline (string[in]))
        {
          pending_space = 1;
          continue;
        }

      if (pending_space)
        {
          pending_space = 0;

          /* Suppress whitespace at start of string.  At least one
             whitespace byte has been consumed, so OUT < IN here and
             STRING[IN] is not overwritten. */
          if (out)
            string[out++] = ' ';
        }

      string[out++] = string[in];
    }

  /* Kill trailing whitespace.  A space is only ever written immediately
     before a non-whitespace byte, so this is purely defensive. */
  if (out && whitespace (string[out - 1]))
    out--;

  string[out] = '\0';
}


/* **************************************************************** */
/*                                                                  */
/*                          Scanning node                           */
/*                                                                  */
/* **************************************************************** */

/* Whether to strip syntax from the text of nodes. */
int preprocess_nodes_p;

/* Whether contents of nodes should be rewritten.  Set by init_conversion
   when character encoding conversion is enabled. */
static int rewrite_p;

/* inptr is moved forward through the body of a node. */
static char *inptr;

/* Pointer to first byte of node (after node separator). */
static char *input_start;

/* Number of bytes in node contents. */
static size_t input_length;

/* Buffer that receives the rewritten node contents.  It is only
   initialized by init_output_stream when REWRITE_P is set. */
struct text_buffer output_buf;

/* Pointer into a tags table for the file to the anchor we need to adjust as
   a result of byte counts changing due to character encoding conversion or
   inserted/deleted text. */
static TAG **anchor_to_adjust;
/* Offset within file buffer of first byte of node, used for anchor
   adjustment. */
static int node_offset;

/* Difference so far between the number of bytes input in the file and
   bytes output.  Used to adjust the values of anchors in nodes. */
static long int output_bytes_difference;

/* Whether we are converting the character encoding of the file. */
static int convert_encoding_p;

#if HAVE_ICONV

/* Whether text in file is encoded in UTF-8. */
static int file_is_in_utf8;

/* Used for conversion from file encoding to output encoding.  Reset to
   (iconv_t) -1 when the descriptor is closed. */
static iconv_t iconv_to_output;

/* Conversion from file encoding to UTF-8.  Only opened when the file is
   not already in UTF-8. */
static iconv_t iconv_to_utf8;

#endif /* HAVE_ICONV */

/* Decide whether the contents of FB need converting to the character
   encoding of the current locale and, if so, open the iconv descriptors
   for it.  On success CONVERT_ENCODING_P and REWRITE_P are set to 1.
   Conversion stays off if FB is null, if the encoding of FB is unknown or
   the same as that of the locale, or if iconv lacks a needed conversion.
   Without iconv support this only clears CONVERT_ENCODING_P. */
void
init_conversion (FILE_BUFFER *fb)
{
#if HAVE_ICONV
  char *locale_codeset;
#endif

  convert_encoding_p = 0;

  /* Node being processed does not come from an Info file. */
  if (!fb)
    return;

#if HAVE_ICONV
  file_is_in_utf8 = 0;

  /* Nothing can be done if the encoding of the file is unknown. */
  if (!fb->encoding)
    return;

  /* Character encoding of the locale taken from the environment. */
  locale_codeset = nl_langinfo (CODESET);

  /* No conversion is needed if the file already uses it. */
  if (strcasecmp (locale_codeset, fb->encoding) == 0)
    return;

  /* Give up if iconv cannot convert from the file encoding to that of
     the locale. */
  iconv_to_output = iconv_open (locale_codeset, fb->encoding);
  if (iconv_to_output == (iconv_t) -1)
    return;

  file_is_in_utf8 = (strcasecmp ("UTF8", fb->encoding) == 0
                     || strcasecmp ("UTF-8", fb->encoding) == 0);

  if (!file_is_in_utf8)
    {
      /* A second descriptor, to UTF-8, is needed for degrading
         characters that the locale cannot represent. */
      iconv_to_utf8 = iconv_open ("UTF-8", fb->encoding);
      if (iconv_to_utf8 == (iconv_t) -1)
        {
          iconv_close (iconv_to_output);
          iconv_to_output = (iconv_t) -1;
          return;
        }
    }

  rewrite_p = 1;
  convert_encoding_p = 1;
#endif /* HAVE_ICONV */
}

/* Close the iconv descriptors opened by init_conversion, if conversion
   was enabled.  ICONV_TO_OUTPUT is reset to (iconv_t) -1. */
void
close_conversion (void)
{
#if HAVE_ICONV
  if (!convert_encoding_p)
    return;

  iconv_close (iconv_to_output);
  iconv_to_output = (iconv_t) -1;

  /* The UTF-8 descriptor is only opened for files not in UTF-8. */
  if (!file_is_in_utf8)
    iconv_close (iconv_to_utf8);
#endif
}

/* Prepare to scan a node from FB: clear the running byte difference, set
   up encoding conversion, and initialize the output buffer if the node
   is going to be rewritten. */
static void
init_output_stream (FILE_BUFFER *fb)
{
  output_bytes_difference = 0;

  /* This may turn REWRITE_P on. */
  init_conversion (fb);

  if (!rewrite_p)
    return;

  text_buffer_init (&output_buf);
}

/* Snapshot taken by save_conversion_state and restored by
   reset_conversion. */
static size_t saved_offset;
static char *saved_inptr;
static long saved_difference;

/* Remember the current input position, output offset and byte difference
   so that reset_conversion can return to them. */
void
save_conversion_state (void)
{
  saved_inptr = inptr;
  saved_difference = output_bytes_difference;
  saved_offset = text_buffer_off (&output_buf);
}

/* Restore the input position, output offset and byte difference recorded
   by the last call to save_conversion_state. */
void
reset_conversion (void)
{
  inptr = saved_inptr;
  output_bytes_difference = saved_difference;
  text_buffer_off (&output_buf) = saved_offset;
}

/* Append the next N bytes of input to the output buffer unchanged, with no
   encoding conversion, and advance INPTR past them. */
static void
copy_direct (long n)
{
  char *chunk = inptr;

  inptr = chunk + n;
  text_buffer_add_string (&output_buf, chunk, n);
}

/* Read one UTF-8 character at *FROM and write to the output buffer a
   sequence of bytes representing that character in ASCII.  *FROM_LEFT is
   the number of bytes available at *FROM; no more than that are read.
   *FROM is advanced past the character read, and *FROM_LEFT is reduced
   to match.

   Return 1 if the character was found in the table of replacements.
   Otherwise write a single "?" for the whole character and return 0.
   If *FROM_LEFT is zero, do nothing and return 0. */
static int
degrade_utf8 (char **from, size_t *from_left)
{
  static const struct encoding_replacement
  {
    char *from_string;
    char *to_string;
  } er[] = {
    {"\xE2\x80\x98","'"}, /* Opening single quote */
    {"\xE2\x80\x99","'"}, /* Closing single quote */
    {"\xE2\x80\x9C","\""},/* Opening double quote */
    {"\xE2\x80\x9D","\""},/* Closing double quote */
    {"\xC2\xA9","(C)"},   /* Copyright symbol */
    {"\xC2\xBB",">>"},    /* Closing double angle brackets */

    {"\xE2\x86\x92","->"},/* Right arrow */
    {"\xE2\x87\x92","=>"},/* Right double arrow */
    {"\xE2\x8A\xA3","-|"},/* Print symbol */
    {"\xE2\x98\x85","-!-"}, /* Point symbol */
    {"\xE2\x86\xA6","==>"}, /* Expansion symbol */

    {"\xE2\x80\x90","-"},  /* Hyphen */
    {"\xE2\x80\x91","-"},  /* Non-breaking hyphen */
    {"\xE2\x80\x92","-"},  /* Figure dash */
    {"\xE2\x80\x93","-"},  /* En dash */
    {"\xE2\x80\x94","--"},  /* Em dash */
    {"\xE2\x88\x92","-"},  /* Minus sign */
    {"\xE2\x80\xA6","..."},  /* Ellipsis */
    {"\xE2\x80\xA2","*"},  /* Bullet */

    {"\xC3\xA0","a`"},   /* Lower case letter a with grave accent */
    {"\xC3\xA2","a^"},   /* Lower case letter a with circumflex */
    {"\xC3\xA4","a\""},  /* Lower case letter a with diaeresis */
    {"\xC3\xA6","ae"},   /* Lower case letter ae ligature */
    {"\xC3\xA9","e'"},   /* Lower case letter e with acute accent */
    {"\xC3\xA8","e`"},   /* Lower case letter e with grave accent */
    {"\xC3\xAA","e^"},   /* Lower case letter e with circumflex */
    {"\xC3\xAB","e\""},  /* Lower case letter e with diaeresis */
    {"\xC3\xB6","o\""},  /* Lower case letter o with diaeresis */
    {"\xC3\xBC","u\""},  /* Lower case letter u with diaeresis */
    {"\xC3\x84", "A\""},  /* Upper case letter A with diaeresis. */
    {"\xC3\x96", "O\""},  /* Upper case letter O with diaeresis. */
    {"\xC3\x9c", "U\""},  /* Upper case letter U with diaeresis. */

    {"\xC3\xB1","n~"},  /* Lower case letter n with tilde */
    {"\xC3\x87","C,"},  /* Upper case letter C with cedilla */
    {"\xC3\xA7","c,"},  /* Lower case letter c with cedilla */
    {"\xC3\x9f","ss"},  /* Lower case letter sharp s */

    {0, 0}
  };

  const struct encoding_replacement *erp;
  unsigned char lead;
  size_t char_len;      /* Length implied by the lead byte. */
  size_t skipped;

  /* With nothing to read, decrementing *FROM_LEFT below would wrap. */
  if (*from_left == 0)
    return 0;

  for (erp = er; erp->from_string != 0; erp++)
    {
      /* Avoid reading past end of input. */
      size_t width = strlen (erp->from_string);
      if (width > *from_left)
        continue;

      if (!strncmp (erp->from_string, *from, width))
        {
          text_buffer_add_string (&output_buf, erp->to_string,
                                  strlen (erp->to_string));
          *from += width;
          *from_left -= width;
          return 1;
        }
    }

  /* Failing this, just print a question mark.  Maybe we should use SUB
     (^Z) (ASCII substitute character code) instead, or pass through the
     original bytes. */
  text_buffer_add_string (&output_buf, "?", 1);

  /* Work out from the lead byte how long the character claims to be.
     Anything that is not a valid lead byte counts as one byte. */
  lead = (unsigned char) **from;
  if ((lead & 0xE0) == 0xC0)
    char_len = 2;
  else if ((lead & 0xF0) == 0xE0)
    char_len = 3;
  else if ((lead & 0xF8) == 0xF0)
    char_len = 4;
  else
    char_len = 1;

  (*from)++;
  (*from_left)--;

  /* Skip the continuation bytes (10xxxxxx) that belong to the character,
     so that they are not each treated as a separate unknown character.
     Stop early if the input runs out or the sequence is malformed. */
  for (skipped = 1;
       skipped < char_len
         && *from_left > 0
         && ((unsigned char) **from & 0xC0) == 0x80;
       skipped++)
    {
      (*from)++;
      (*from_left)--;
    }

  return 0;
}

/* Convert N bytes from input to output encoding and write to
   output buffer, advancing INPTR past the bytes consumed.  If the N bytes
   end part-way through a multibyte sequence, further bytes are read from
   the node contents to complete it.  Characters that cannot be converted
   are degraded to ASCII with degrade_utf8.

   On normal completion OUTPUT_BYTES_DIFFERENCE is increased by N minus the
   number of bytes written, and the return value is the number of input
   bytes consumed in addition to N.  On an unrecognized iconv error the
   rest of the input is skipped and 0 is returned.  Without iconv support
   this does nothing and returns 0. */
static int
copy_converting (long n)
{
#if !HAVE_ICONV
  return 0;
#else
  size_t bytes_left, orig_bytes_left;
  int extra_at_end;     /* Input bytes consumed beyond the N requested. */
  size_t iconv_ret;
  long output_start;    /* Offset in output buffer where we began. */

  size_t utf8_char_free; 
  char utf8_char[4]; /* Maximum 4 bytes in a UTF-8 character */
  char *utf8_char_ptr, *orig_inptr;
  size_t i;
  
  /* Use n as an estimate of how many bytes will be required
     in target encoding. */
  text_buffer_alloc (&output_buf, (size_t) n);

  output_start = text_buffer_off (&output_buf);
  bytes_left = n;
  extra_at_end = 0;
  /* Each pass converts as much as it can.  The loop ends once everything
     has been converted; otherwise the error is dealt with and conversion
     resumes from wherever INPTR was left. */
  while (1)
    {
      iconv_ret = text_buffer_iconv (&output_buf, iconv_to_output,
                                     (ICONV_CONST char **)&inptr, &bytes_left);

      /* Make sure libiconv flushes out the last converted character.
	 This is required when the conversion is stateful, in which
	 case libiconv might not output the last character, waiting to
	 see whether it should be combined with the next one.  */
      if (iconv_ret != (size_t) -1
	  && text_buffer_iconv (&output_buf, iconv_to_output,
				NULL, NULL) != (size_t) -1)
        /* Success: all of input converted. */
        break;

      /* There's been an error while converting. */
      switch (errno)
        {
        case EINVAL:
          /* Incomplete byte sequence at end of input buffer.  Try to read
             more. */

          /* input_length - 2 is offset of last-but-one byte within input.
             This checks if there is at least one more byte within node
             contents. */
          if (inptr - input_start + (bytes_left - 1) <= input_length - 2)
            {
              bytes_left++;
              extra_at_end++;
            }
          else
            {
              /* No more input to complete the sequence with: pass the
                 remaining bytes through unconverted. */
              copy_direct (bytes_left);
              bytes_left = 0;
            }
          continue;
        default: /* Unknown error */
          info_error (_("Error converting file character encoding"));

          /* Skip past current input and hope we don't get an
             error next time. */
          /* NOTE(review): this returns without the update to
             output_bytes_difference made at the end of the function, and
             discards extra_at_end -- confirm this is intended. */
          inptr += bytes_left;
          return 0;
        case EILSEQ:
          /* Byte sequence in input not recognized.  Degrade to ASCII.  */
          break;
        }

      /* Flush any waiting input in iconv_to_output and enter the
         default shift state. */
      text_buffer_iconv (&output_buf, iconv_to_output, NULL, NULL);
      
      if (file_is_in_utf8)
        {
          /* The input is already UTF-8, so the replacement can be looked
             up directly. */
          degrade_utf8 (&inptr, &bytes_left);
          continue;     
        }

      /* If file is not in UTF-8, we degrade to ASCII in two steps:
         first convert the character to UTF-8, then look up a replacement
         string.  Note that mixing iconv_to_output and iconv_to_utf8
         on the same input may not work well if the input encoding
         is stateful.  We could deal with this by always converting to
         UTF-8 first; then we could mix conversions on the UTF-8 stream. */

      /* We want to read exactly one character.  Do this by
         restricting size of output buffer. */
      utf8_char_ptr = utf8_char;
      orig_inptr = inptr;
      orig_bytes_left = bytes_left;
      /* Offer iconv 1, 2, 3 then 4 bytes of output space, stopping at the
         first size that yields a character or a real error. */
      for (i = 1; i <= 4; i++)
        {
          utf8_char_free = i;
          errno = 0;
          iconv_ret = iconv (iconv_to_utf8, (ICONV_CONST char **)&inptr,
                             &bytes_left, &utf8_char_ptr, &utf8_char_free);
          if ((iconv_ret == (size_t) -1 && errno != E2BIG)
              /* If we managed to convert a character: */
              || utf8_char_ptr > utf8_char)
            break;
        }

      /* errno == E2BIG if iconv ran out of output buffer,
         which is expected. */
      if (iconv_ret == (size_t) -1 && errno != E2BIG)
	{
	  /* Character is not recognized.  Copy a single byte.  */
	  inptr = orig_inptr;	/* iconv might have incremented inptr  */
	  copy_direct (1);
	  bytes_left = orig_bytes_left - 1;
	}
      else
        {
          utf8_char_ptr = utf8_char;
          /* i is width of UTF-8 character */
          degrade_utf8 (&utf8_char_ptr, &i);
	  /* If we are done, make sure iconv flushes the last character.  */
	  if (bytes_left <= 0)
	    {
	      utf8_char_ptr = utf8_char;
	      i = 4;
	      /* NOTE(review): utf8_char_free still holds whatever was left
	         over from the loop above, not 4 -- confirm that this is
	         enough room for the flushed output. */
	      iconv (iconv_to_utf8, NULL, NULL,
		     &utf8_char_ptr, &utf8_char_free);
	      if (utf8_char_ptr > utf8_char)
		{
		  utf8_char_ptr = utf8_char;
		  degrade_utf8 (&utf8_char_ptr, &i);
		}
	    }
        }
    }

  /* Must cast because the difference between unsigned size_t is always
     positive. */
  output_bytes_difference +=
    n - ((signed long) text_buffer_off (&output_buf) - output_start);

  return extra_at_end;
#endif /* HAVE_ICONV */
}

/* Functions below are named from the perspective of the preprocess_nodes_p
   flag being on. */

/* Copy text from input node contents, possibly converting the
   character encoding and adjusting anchor offsets at the same time.
   N is the number of input bytes to copy.  If the node is not being
   rewritten, INPTR is simply advanced by N and nothing is output. */
static void
copy_input_to_output (long n)
{
  if (rewrite_p)
    {
      long bytes_left;

      bytes_left = n;
      while (bytes_left > 0)
        {
          if (!convert_encoding_p)
            {
              /* No conversion: everything goes across in one piece. */
              copy_direct (bytes_left);
              bytes_left = 0;
            }
          else
            {
              long bytes_to_convert;
              long extra_written;

              bytes_to_convert = bytes_left;

              if (anchor_to_adjust)
                {
                  /* Check there is an anchor in the input. */
                  /* Offset of the next anchor from the start of the
                     node. */
                  long first_anchor =
                    (*anchor_to_adjust)->nodestart - node_offset;

                  if (first_anchor < 0)
                    anchor_to_adjust = 0; /* error in input file */
                  else if (first_anchor < (inptr-input_start) + bytes_left)
                    {
                      /* Convert enough to pass the first anchor in input. */
                      bytes_to_convert = first_anchor - (inptr-input_start)+1;
                      if (bytes_to_convert < 0)
                        {
                          /* The anchor lies behind the current position:
                             stop adjusting anchors and convert the rest. */
                          bytes_to_convert = bytes_left;
                          anchor_to_adjust = 0;
                        }
                    }
                }

              /* copy_converting may read more than bytes_to_convert
                 bytes if its input ends in an incomplete byte sequence. */
              extra_written = copy_converting (bytes_to_convert);

              bytes_left -= bytes_to_convert + extra_written;
            }

          /* Check if we have gone past any anchors and
             adjust with output_bytes_difference. */
          if (anchor_to_adjust)
            while ((*anchor_to_adjust)->nodestart - node_offset
                   <= inptr - input_start)
              {
                (*anchor_to_adjust)->nodestart_adjusted
                   = (*anchor_to_adjust)->nodestart - output_bytes_difference;

                anchor_to_adjust++;
                /* Stop at the end of the tag list, or at a tag whose
                   cached node length is nonzero.
                   NOTE(review): presumably a nonzero length marks a node
                   tag rather than an anchor -- confirm against TAG. */
                if (!*anchor_to_adjust
                    || (*anchor_to_adjust)->cache.nodelen != 0)
                  {
                    anchor_to_adjust = 0;
                    break;
                  }
              }
        }
    }
  else
    inptr += n;
}

/* Pass over N bytes of input.  When preprocessing nodes, the bytes are
   dropped from the output and counted in OUTPUT_BYTES_DIFFERENCE.  When
   only rewriting (for tag expansion or encoding conversion), they are
   copied to the output instead.  Otherwise INPTR is just advanced. */
static void
skip_input (long n)
{
  if (!preprocess_nodes_p && rewrite_p)
    {
      /* We are expanding tags only.  Do not skip input. */
      copy_input_to_output (n);
      return;
    }

  inptr += n;
  if (preprocess_nodes_p)
    output_bytes_difference += n;
}

/* When preprocessing nodes, append the N bytes at INPUT to the output
   buffer as text with no counterpart in the input, and reduce
   OUTPUT_BYTES_DIFFERENCE to match.  Otherwise do nothing. */
static void
write_extra_bytes_to_output (char *input, long n)
{
  if (!preprocess_nodes_p)
    return;

  output_bytes_difference -= n;
  text_buffer_add_string (&output_buf, input, n);
}

/* Like write_extra_bytes_to_output, but governed by REWRITE_P rather than
   PREPROCESS_NODES_P, so that bytes are written even when
   preprocess_nodes=Off. */
static void
write_tag_contents (char *input, long n)
{
  if (!rewrite_p)
    return;

  output_bytes_difference -= n;
  text_buffer_add_string (&output_buf, input, n);
}

/* Like skip_input, but drop the N bytes from the output whenever the node
   is being rewritten, even when !preprocess_nodes_p.  Does nothing if the
   node is not being rewritten. */
static void
skip_tag_contents (long n)
{
  if (!rewrite_p)
    return;

  output_bytes_difference += n;
  inptr += n;
}

/* Return the length of LABEL if TEXT begins with LABEL, ignoring case.
   Return 0 otherwise. */
static size_t
header_label_length (const char *text, const char *label)
{
  size_t len = strlen (label);

  return strncasecmp (text, label, len) == 0 ? len : 0;
}

/* Read the first line of NODE's contents and set NODE->next, NODE->prev
   and NODE->up from the "Next:", "Prev:" (or its alternative spelling)
   and "Up:" fields found there.  Each is stored as a newly allocated
   string holding the field value, including any "(FILE)" part; fields
   that are absent are left as null pointers.  "File:" and "Node:" fields
   and unrecognized text are skipped.

   If the first line is empty, NODE is left untouched. */
static void
parse_top_node_line (NODE *node)
{
  char *ptr;

  /* If the first line is empty, leave it in.  This is the case
     in the index-apropos window. */
  if (*node->contents == '\n')
    return;

  node->next = node->prev = node->up = 0;
  ptr = node->contents;

  while (1)
    {
      char **store_in = 0;      /* Where to save this field, if anywhere. */
      char *nodename;
      size_t label_len;
      long value_length = 0;

      ptr += skip_whitespace (ptr);

      /* Check what field we are looking at. */
      if ((label_len = header_label_length (ptr, INFO_FILE_LABEL)) != 0
          || (label_len = header_label_length (ptr, INFO_NODE_LABEL)) != 0)
        store_in = 0;
      else if ((label_len = header_label_length (ptr, INFO_PREV_LABEL)) != 0
               || (label_len
                     = header_label_length (ptr, INFO_ALTPREV_LABEL)) != 0)
        store_in = &node->prev;
      else if ((label_len = header_label_length (ptr, INFO_NEXT_LABEL)) != 0)
        store_in = &node->next;
      else if ((label_len = header_label_length (ptr, INFO_UP_LABEL)) != 0)
        store_in = &node->up;
      /* Otherwise not recognized: LABEL_LEN is 0 and the code below
         skips to the next comma. */

      ptr += label_len;
      ptr += skip_whitespace (ptr);

      /* Get length of a bracketed filename component. */
      if (*ptr == '(')
        value_length = read_bracketed_filename (ptr, 0);

      /* Get length of node name, or filename if following "File:".  Note
         that . is not included in the second argument here in order to
         support this character in file names. */
      value_length += read_quoted_string (ptr + value_length,
                                          "\n\r\t,", 1, &nodename);
      /* Only the length was wanted. */
      free (nodename);

      if (store_in)
        {
          /* The same field may occur twice on the line.  Release the
             earlier value instead of leaking it. */
          free (*store_in);

          *store_in = xmalloc (value_length + 1);
          memcpy (*store_in, ptr, value_length);
          (*store_in)[value_length] = '\0';
        }

      ptr += value_length;

      if (*ptr == '\n' || !*ptr)
        break;

      ptr += 1; /* Point after field terminator */
    }
}

/* Output, replace or hide the text introducing a reference, and set
   ENTRY->type to match.  INPTR starts on the first byte of the
   introducing sequence: a newline for a menu item ("\n* "), otherwise the
   "*" of "*Note" or "*note".  It finishes on the first non-whitespace
   byte of the reference label.

   IN_PARENTHESES says whether the cross-reference directly follows an
   opening parenthesis.

   Return 1 on success.  Return 0 if "*Note" is not followed by
   whitespace, in which case the text is copied through and is not
   treated as a reference. */
static int
scan_reference_marker (REFERENCE *entry, int in_parentheses)
{
  /* Decide this before INPTR is moved by any of the calls below. */
  int is_menu_item = (inptr[0] == '\n');

  /* When preprocess_nodes is Off, we position the cursor on
     the "*" when moving between references. */
  if (!preprocess_nodes_p)
    {
      if (rewrite_p)
        entry->start = text_buffer_off (&output_buf);
      else
        entry->start = inptr - input_start;

      /* For a menu item, step over the newline before the "*". */
      if (is_menu_item)
        entry->start++;
    }

  if (is_menu_item)
    {
      entry->type = REFERENCE_MENU_ITEM;
      copy_input_to_output (strlen ("\n* "));
    }
  else
    {
      entry->type = REFERENCE_XREF;

      /* Only match "*Note" if it is followed by a whitespace character so
         that it will not be recognized if, e.g., it is surrounded in
         inverted commas. */
      if (!strchr (" \t\r\n", inptr[strlen ("*Note")]))
        {
          copy_input_to_output (strlen ("*Note:"));
          return 0;
        }

      /* Cross-references can be generated by four different Texinfo
         commands.  @inforef and @xref output "*Note " in Info format,
         and "See" in HTML and print.  @ref and @pxref output "*note "
         in Info format, and either nothing at all or "see" in HTML
         and print.  Unfortunately, there is no easy way to distinguish
         between these latter two cases. */
      /* TODO: Internationalize these strings, but only if we know the
         language of the document. */
      if (inptr[1] == 'N')
        {
          write_extra_bytes_to_output ("See", 3);
          in_parentheses = 1;
        }
      else if (in_parentheses)
        {
          /* Only output the "see" for input like "(*note ...)", which
             would have come from a use of @pxref.  We used to output
             "see" for "*note" in more circumstances, with a list of words
             where to suppress it (to avoid "see *note" turning into "see
             see"), but such a list can't be complete or reliable.  It's
             better to remove it with more enthusiasm, then if the
             document writer wants a "see" to appear, they can add one
             themselves. */
          write_extra_bytes_to_output ("see", 3);
        }

      skip_input (strlen ("*Note"));

      /* With nothing written in place of the marker, also drop the
         whitespace that followed it. */
      if (!in_parentheses)
        skip_input (skip_whitespace (inptr));
    }

  /* Copy any white space before label. */
  copy_input_to_output (skip_whitespace_and_newlines (inptr));

  return 1;
}

/* Output reference label and update ENTRY.  INPTR should be on the first
   non-whitespace byte of label when this function is called.  It is left
   at the first character after the colon terminating the label.  Return 0 if
   invalid syntax is encountered.

   ENTRY->label receives a newly allocated copy of the label with its
   whitespace canonicalized; when an iconv conversion to the output encoding
   is open, the copy is the converted text, falling back to the raw bytes if
   conversion fails.  IN_INDEX is non-zero when an index tag has been seen
   in the node, in which case the label may itself contain colons. */
static int
scan_reference_label (REFERENCE *entry, int in_index)
{
  int max_lines;
  int len, label_len = 0;

  /* Handle case of cross-reference like (FILE)NODE::. */
  if (inptr[0] == '(' && !in_index)
    label_len = read_bracketed_filename (inptr, &entry->filename);

  /* Search forward to ":" to get label name.  Cross-references may have
     a newline in the middle. */
  if (entry->type == REFERENCE_MENU_ITEM)
    max_lines = 1;
  else
    max_lines = 2;
  if (!in_index || inptr[label_len] == '\177')
    {
      len = read_quoted_string (inptr + label_len, ":", max_lines,
                                &entry->nodename);
      /* Check validity before touching the node name; on failure the
         caller discards ENTRY anyway. */
      if (!len)
        return 0; /* Input invalid. */
      canonicalize_whitespace (entry->nodename);
      label_len += len;
    }
  else
    {
      /* If in an index node, go forward to the last colon on the line
         (not preceded by a newline, NUL or DEL).  This is in order to
         support index entries containing colons.  This should work fine
         as long as the node name does not contain a colon as well. */

      char *p;
      int n, m = 0;
      p = inptr + label_len;

      while (1)
        {
          n = strcspn (p, ":\n\177");
          if (p[n] == ':')
            {
              /* M counts bytes up to and including the latest colon. */
              m += n + 1;
              p += n + 1;
              continue;
            }
          break;
        }
      if (m == 0)
        return 0; /* no : found */
      label_len += m - 1; /* Exclude the terminating colon itself. */
    }

#if HAVE_ICONV
  if (iconv_to_output != (iconv_t) -1 && iconv_to_output != (iconv_t) 0)
    {
      /* Static so the buffer's storage is reused between calls. */
      static struct text_buffer label_text;
      size_t inbytesleft = label_len;
      size_t converted_len;
      char *p = inptr;

      text_buffer_reset (&label_text);
      text_buffer_alloc (&label_text, label_len);

      /* Convert the label, then make sure libiconv flushes out the last
         converted character.  The flush is attempted only if the
         conversion itself succeeded.  Any error falls back to copying
         the unconverted bytes. */
      if (text_buffer_iconv (&label_text, iconv_to_output,
                             (ICONV_CONST char **)&p,
                             &inbytesleft) == (size_t) -1
          || text_buffer_iconv (&label_text, iconv_to_output,
                                NULL, NULL) == (size_t) -1)
        goto no_convert;

      text_buffer_add_char (&label_text, '\0');

      /* Copy with xmalloc rather than an unchecked strdup, so that
         ENTRY->label can never be left null on allocation failure. */
      converted_len = strlen (label_text.base);
      entry->label = xmalloc (converted_len + 1);
      memcpy (entry->label, label_text.base, converted_len + 1);
    }
  else
#endif
    {
  no_convert:
      entry->label = xmalloc (label_len + 1);
      memcpy (entry->label, inptr, label_len);
      entry->label[label_len] = '\0';
    }
  canonicalize_whitespace (entry->label);

  if (preprocess_nodes_p)
    entry->start = text_buffer_off (&output_buf);

  /* Write text of label. */
  copy_input_to_output (label_len);

  /* ENTRY->end is an offset into the output when the node is being
     rewritten, otherwise an offset into the input. */
  if (rewrite_p)
    entry->end = text_buffer_off (&output_buf);
  else
    entry->end = inptr - input_start;

  /* Colon after label. */
  if (*inptr)
    skip_input (1);
  /* Don't mess up the margin of a menu description. */
  if (entry->type == REFERENCE_MENU_ITEM)
    write_extra_bytes_to_output (" ", 1);

  return 1;
}

/* Parse the explicit target of a reference.  INPTR should be at the first
   character after the colon terminating the label.  The file name and node
   name read from the specification are stored in ENTRY; for a menu item,
   a following "(line N)" annotation sets ENTRY->line_number.  NODE is the
   node being scanned and IN_PARENTHESES is non-zero for a reference like
   "(*note ...)".  Return 0 on syntax error (at present every path
   returns 1). */
static int
scan_reference_target (REFERENCE *entry, NODE *node, int in_parentheses)
{
  int pad; /* Countdown used when emitting runs of spaces. */

  if (entry->type == REFERENCE_XREF)
    {
      char *spec_start = inptr;
      char *newline;
      int indent = 0; /* Whitespace following a newline inside the target. */
      int spec_len;   /* Length of specification */

      /* Measure: whitespace, optional "(file)", whitespace, node name. */
      spec_len = skip_whitespace_and_newlines (inptr);
      spec_len += read_bracketed_filename (inptr + spec_len,
                                           &entry->filename);
      spec_len += skip_whitespace_and_newlines (inptr + spec_len);
      spec_len += read_quoted_string (inptr + spec_len, ",.", 2,
                                      &entry->nodename);

      skip_input (spec_len);

      /* A newline only matters if it lies within the text just skipped. */
      newline = strchr (spec_start, '\n');
      if (newline && newline < inptr)
        indent = skip_whitespace (newline + 1);
      else
        newline = 0;

      canonicalize_whitespace (entry->nodename);

      if (entry->filename)
        {
          /* Heuristic for whether to break the line before the file name:
             do so when the newline fell within the first half of the
             target text. */
          if (newline
              && newline < spec_start + (spec_len - indent) / 2)
            {
              write_extra_bytes_to_output ("\n", 1);
              for (pad = indent; pad > 0; pad--)
                write_extra_bytes_to_output (" ", 1);
              skip_input (strspn (inptr, " "));
              newline = 0; /* The line break has now been output. */
            }
          else if (*inptr != '\n')
            write_extra_bytes_to_output (" ", 1);

          write_extra_bytes_to_output ("(", 1);
          write_extra_bytes_to_output (entry->filename,
                                       strlen (entry->filename));
          write_extra_bytes_to_output (" manual)",
                                       strlen (" manual)"));
        }

      /* In a reference like "(*note Label:(file)node.)" the terminating
         full stop is hidden. */
      if (in_parentheses && inptr[0] == '.')
        skip_input (1);

      /* Any terminating punctuation goes before the optional newline. */
      copy_input_to_output (strspn (inptr, ".),"));

      /* Emit the pending line break, except at the end of a paragraph. */
      if (newline && *inptr != '\n')
        {
          write_extra_bytes_to_output ("\n", 1);
          for (pad = indent; pad > 0; pad--)
            write_extra_bytes_to_output (" ", 1);
          skip_input (strspn (inptr, " "));
        }
    }
  else /* entry->type == REFERENCE_MENU_ITEM */
    {
      int column = 0; /* Length of the line so far; set for dir nodes. */
      int spec_len;   /* Length of specification */

      spec_len = skip_whitespace (inptr);
      spec_len += read_bracketed_filename (inptr + spec_len,
                                           &entry->filename);
      spec_len += strspn (inptr + spec_len, " ");

      /* The node name comes next. */
      spec_len += read_quoted_string (inptr + spec_len, ",.\t\n", 2,
                                      &entry->nodename);
      if (inptr[spec_len] == '.') /* A '.' terminating the entry. */
        spec_len++;
      canonicalize_whitespace (entry->nodename);

      if (node->flags & N_IsDir)
        {
          /* Locate the start of the current line to measure how far
             along it INPTR is. */
          char *prev_newline = memrchr (input_start, '\n',
                                        inptr - input_start);
          char *line_begin = prev_newline ? prev_newline + 1 : input_start;

          column = inptr - line_begin;
        }

      if (node->flags & N_IsIndex)
        {
          /* Show the name of the node the index entry refers to. */
          copy_input_to_output (spec_len);
        }
      else
        {
          skip_input (spec_len);

          if ((node->flags & N_IsDir) && inptr[strspn (inptr, " ")] == '\n')
            {
              /* Dir node with nothing more on this line: if the next
                 line holds a menu entry description positioned to the
                 right of the end of the label, pull it up onto this
                 line. */
              skip_input (strspn (inptr, " "));
              if (column <= strspn (inptr + 1, " "))
                skip_input (1 + column);
            }
          else
            {
              /* Replace the hidden specification with blanks. */
              for (pad = spec_len; pad > 0; pad--)
                write_extra_bytes_to_output (" ", 1);
            }
        }

      /* Parse "(line ...)" part of menus, if any.  */
      {
        /* Skip whitespace, then a newline and further whitespace, in case
           the item was long enough to push the ``(line ...)'' string onto
           the following physical line.  */
        char *scan = inptr + skip_whitespace (inptr);

        if (*scan == '\n')
          scan += 1 + skip_whitespace (scan + 1);

        if (strncmp (scan, "(line ", strlen ("(line ")) == 0)
          entry->line_number = strtol (scan + strlen ("(line "), 0, 0);
        else
          entry->line_number = 0;
      }
    }

  return 1;
}

/* Bounds-checked read of PTR[INDEX].  PTR points into a block of allocated
   memory that begins at BASE and is at least LEN bytes long.  If the
   requested byte lies within BASE[0] .. BASE[LEN - 1] it is returned;
   otherwise the result is 0. */
static char
safe_string_index (char *ptr, long index, char *base, long len)
{
  /* Position of the requested byte relative to BASE. */
  long pos = (ptr - base) + index;

  if (pos >= 0 && pos < len)
    return ptr[index];

  return 0;
}

/* Process an index marker ("^@^H[index^@^H]") or an image marker
   ("^@^H[image ...^@^H]") starting at INPTR.

   If the tag is valid, the input before it is copied to the output, the
   tag's expansion is written in its place and the tag body is skipped;
   rewriting of the node is switched on if it was not already.  NODE is
   flagged N_IsIndex when *IN_INDEX is set after expansion.  If the tag is
   not valid, a single byte is copied to the output so that scanning can
   move on.  FB is passed to init_output_stream when rewriting has to be
   started here. */
static void
scan_info_tag (NODE *node, int *in_index, FILE_BUFFER *fb)
{
  char *tag_start = inptr;
  char *tag_end = inptr;

  /* The buffer header is small and fixed-size, so it lives on the stack;
     only the storage it manages is released with text_buffer_free. */
  struct text_buffer expansion;

  text_buffer_init (&expansion);

  if (tag_expand (&tag_end, input_start + input_length, &expansion,
                  in_index))
    {
      if (*in_index)
        node->flags |= N_IsIndex;

      if (!rewrite_p)
        {
          rewrite_p = 1;
          init_output_stream (fb);

          /* Put inptr back to start so that
             copy_input_to_output below gets all
             preceding contents. */
          inptr = node->contents;
        }

      /* Write out up to tag. */
      copy_input_to_output (tag_start - inptr);

      write_tag_contents (text_buffer_base (&expansion),
                          text_buffer_off (&expansion));
      /* Skip past body of tag. */
      skip_tag_contents (tag_end - inptr);
    }
  else
    {
      /* It was not a valid tag. */
      copy_input_to_output (tag_start - inptr + 1);
    }

  text_buffer_free (&expansion);
}

/* Non-zero if CONTENTS begins with STRING, compared without regard to
   case.  Note that STRING is evaluated twice. */
#define looking_at_string(contents, string) \
  (strncasecmp ((contents), (string), strlen (string)) == 0)

static char *
forward_to_info_syntax (char *contents)
{
  /* Loop until just before the end of the input.  The '- 3' prevents us
     accessing memory after the end of the input, and none of the strings we 
     are looking for are shorter than 3 bytes. */
  while (contents < input_start + input_length - 3)
    {
      /* Menu entry comes first to optimize for the case of looking through a 
         long index node. */
      if (looking_at_string (contents, INFO_MENU_ENTRY_LABEL)
          || looking_at_string (contents, INFO_XREF_LABEL)
          || !memcmp (contents, "\0\b[", 3))
        return contents;
      contents++;
    }
  return 0;
}

/* Scan contents of NODE, recording cross-references and similar.  The
   references found are stored in NODE->references; this array is always
   allocated, holding just a null pointer when nothing was found.

   Convert character encoding of node contents to that of the user if the two 
   are known to be different.  If PREPROCESS_NODES_P == 1, remove Info syntax 
   in contents.  When the contents are rewritten, NODE->contents and
   NODE->nodelen are replaced with the rewritten text and its length, and
   N_WasRewritten is set in NODE->flags.

   If FB is non-null, it is the file containing the node, and TAG_PTR is an 
   offset into FB->tags.  If the node contents are rewritten, adjust anchors
   that occur in the node and store adjusted value as TAG->nodestart_adjusted, 
   otherwise simply copy TAG->nodestart to TAG->nodestart_adjusted for each 
   anchor in the node. */
void
scan_node_contents (NODE *node, FILE_BUFFER *fb, TAG **tag_ptr)
{
  int in_menu = 0;
  char *match;

  REFERENCE **refs = NULL;
  size_t refs_index = 0, refs_slots = 0;

  /* Whether an index tag was seen. */
  int in_index = 0;

  rewrite_p = preprocess_nodes_p;

  init_output_stream (fb);

  if (fb)
    {
      char *file_contents;

      /* Set anchor_to_adjust to first anchor in node, if any.  The tag
         after this node's own tag is treated as an anchor only if it
         exists and has a zero cached node length. */
      anchor_to_adjust = tag_ptr + 1;
      if (!*anchor_to_adjust
          || (*anchor_to_adjust)->cache.nodelen != 0)
        anchor_to_adjust = 0;

      if (!node->subfile)
        file_contents = fb->contents;
      else
        {
          FILE_BUFFER *f = info_find_subfile (node->subfile);
          if (!f)
            return; /* This shouldn't happen. */
          file_contents = f->contents;
        }
      node_offset = (*tag_ptr)->nodestart
        + skip_node_separator (file_contents + (*tag_ptr)->nodestart);
    }
  else
    anchor_to_adjust = 0;

  /* Initialize refs to point to array of one null pointer in case
     there are no results.  This way we know if refs has been initialized
     even if it is empty.  xmalloc is used so that the array can never be
     null because of an unchecked allocation failure. */
  refs = xmalloc (sizeof *refs);
  refs[0] = NULL;
  refs_slots = 1;

  parse_top_node_line (node);

  /* This should be the only time we assign to inptr in this function -
     all other assignment should be done with the helper functions above. */
  inptr = node->contents;
  input_start = node->contents;
  input_length = node->nodelen;


  while ((match = forward_to_info_syntax (inptr))
          && match < node->contents + node->nodelen)
    {
      int in_parentheses = 0;
      REFERENCE *entry;

      /* Write out up to match */
      copy_input_to_output (match - inptr); 

      if ((in_menu && match[0] == '\n') || match[0] == '*')
        {
          /* Menu entry or cross reference. */
          /* Create REFERENCE entity. */
          entry = info_new_reference (0, 0);

          /* Detect input like "(*note ...": an opening parenthesis just
             before the marker and a lower-case 'n' just after the '*'. */
          if (safe_string_index (inptr, -1, input_start, input_length) == '('
             && safe_string_index (inptr, 1, input_start, input_length) == 'n')
            in_parentheses = 1;

          /* Saved so that the not_a_reference path below can undo
             whatever the scan_reference_* helpers consumed. */
          save_conversion_state ();
          
          if (!scan_reference_marker (entry, in_parentheses))
            goto not_a_reference;

          if (!scan_reference_label (entry, in_index))
            goto not_a_reference;

          /* If this reference entry continues with another ':' then the target
             of the reference is given by the label. */
          if (*inptr == ':')
            {
              int label_len;
              skip_input (1);
              if (entry->type == REFERENCE_MENU_ITEM)
                write_extra_bytes_to_output (" ", 1);

              /* Remove the DEL bytes from a label like "(FOO)^?BAR^?::".
                 The text after the first DEL is shifted left by one byte,
                 and the string is then cut two bytes short of its old
                 length.
                 NOTE(review): if the trailing DEL is the only one in the
                 label, this also drops the character before it -- confirm
                 that such labels cannot reach here. */
              label_len = strlen (entry->label);
              if (label_len >= 2 && entry->label[label_len - 1] == 0177)
                {
                  char *p = strchr (entry->label, '\177');
                  memmove (p, p + 1, label_len - (p - entry->label) - 1);
                  entry->label[label_len - 2] = '\0';
                }
            }
          else
            {
              /* Proceed to read the rest of the reference. */
              /* TODO: we should probably not allow references of the form 
                 "(file)node1:node2." or "(file1)node1:(file2)node2", so
                 bail out here if entry->filename is non-null. */

              /* Discard what was parsed from the label; the target
                 specification supplies the file and node names. */
              free (entry->filename); entry->filename = 0;
              free (entry->nodename); entry->nodename = 0;
              if (!scan_reference_target (entry, node, in_parentheses))
                goto not_a_reference;
            }

          /* This block is only ever entered through the label inside it. */
          if (0)
            {
              char *cur_inptr;

not_a_reference:
              /* This is not a menu entry or reference.  Do not add to our 
                 list.  Rewind to the saved state and copy the text that
                 had been consumed to the output unchanged. */
              cur_inptr = inptr;
              reset_conversion ();
              copy_input_to_output (cur_inptr - inptr);

              info_reference_free (entry);
              continue;
            }

          add_pointer_to_array (entry, refs_index, refs, refs_slots, 50);
        }
      /* Was "* Menu:" seen?  If so, search for menu entries hereafter. */
      else if (!in_menu && !strncmp (match, INFO_MENU_LABEL,
                               strlen (INFO_MENU_LABEL)))
        {
          in_menu = 1;
          skip_input (strlen ("\n* Menu:"));
          if (*inptr == '\n')
            skip_input (strspn (inptr, "\n") - 1); /* Keep one newline. */

        }
      else if (match[0] == '\0') /* Info tag */
        {
          scan_info_tag (node, &in_index, fb);
        }
      else
        /* Nothing recognized here: pass one byte through so that the
           search resumes after it. */
        copy_input_to_output (1);
    }

  /* If we haven't accidentally gone past the end of the node, write
     out the rest of it. */
  if (inptr < node->contents + node->nodelen)
    copy_input_to_output ((node->contents + node->nodelen) - inptr); 

  /* Null to terminate buffer. */
  if (rewrite_p)
    text_buffer_add_string (&output_buf, "\0", 1);

  /* Free resources used in character encoding conversion. */
  close_conversion ();
  
  node->references = refs;

  if (rewrite_p)
    {
      /* Contents from an earlier rewrite are released before being
         replaced. */
      if (node->flags & N_WasRewritten)
        free (node->contents);
      node->contents = text_buffer_base (&output_buf);
      node->flags |= N_WasRewritten;
 
      /* output_buf.off is the offset of the next character to be
         written.  Subtracting 1 gives the offset of our terminating
         null, that is, the length. */
      node->nodelen = text_buffer_off (&output_buf) - 1;
    }
  else if (fb && tag_ptr)
    {
      /* Set nodestart_adjusted for all of the anchors in this node. */
      tag_ptr++;
      while (*tag_ptr && (*tag_ptr)->cache.nodelen == 0)
        {
          (*tag_ptr)->nodestart_adjusted = (*tag_ptr)->nodestart
                                             - output_bytes_difference;
          tag_ptr++;
        }
    }
}