/* Copyright 2010-2023 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include #include #include #include #include "errors.h" #include "debug.h" #include "input.h" #include "text.h" #include "commands.h" #include "source_marks.h" enum input_type { IN_file, IN_text }; typedef struct { enum input_type type; FILE *file; SOURCE_INFO source_info; char *input_file_path; /* for IN_file type, the full input file path */ char *text; /* Input text to be parsed as Texinfo. */ char *ptext; /* How far we are through 'text'. Used to split 'text' into lines. */ char *value_flag; /* value flag if the input text is a @value expansion */ char *macro_name; /* macro name if the input text is a user-defined macro expansion */ SOURCE_MARK *input_source_mark; } INPUT; static char *input_pushback_string; static iconv_t reverse_iconv; /* used in encode_file_name */ typedef struct { char *encoding_name; iconv_t iconv; } ENCODING_CONVERSION; static ENCODING_CONVERSION *encodings_list = 0; int encoding_number = 0; int encoding_space = 0; char *global_input_encoding_name = 0; static ENCODING_CONVERSION *current_encoding_conversion = 0; /* ENCODING should always be lower cased */ /* WARNING: it is very important for the first call to set_input_encoding to be for "utf-8" as the codes assume a conversion to UTF-8 in encodings_list[0]. 
*/ int set_input_encoding (char *encoding) { int encoding_index = -1; int encoding_set = 0; char *conversion_encoding = encoding; /* should correspond to Texinfo::Common::encoding_name_conversion_map. Thoughts on this mapping are available near Texinfo::Common::encoding_name_conversion_map definition */ if (!strcmp (encoding, "us-ascii")) conversion_encoding = "iso-8859-1"; if (reverse_iconv) { iconv_close (reverse_iconv); reverse_iconv = (iconv_t) 0; } if (!strcmp (encoding, "utf-8")) { if (encoding_number > 0) encoding_index = 0; } else if (encoding_number > 1) { int i; for (i = 1; i < encoding_number; i++) { if (!strcmp (encoding, encodings_list[i].encoding_name)) { encoding_index = i; break; } } } if (encoding_index == -1) { if (encoding_number >= encoding_space) { encodings_list = realloc (encodings_list, (encoding_space += 3) * sizeof (ENCODING_CONVERSION)); } encodings_list[encoding_number].encoding_name = strdup (conversion_encoding); /* Initialize conversions for the first time. iconv_open returns (iconv_t) -1 on failure so these should only be called once. */ encodings_list[encoding_number].iconv = iconv_open ("UTF-8", conversion_encoding); encoding_index = encoding_number; encoding_number++; } if (encodings_list[encoding_index].iconv == (iconv_t) -1) current_encoding_conversion = 0; else { current_encoding_conversion = &encodings_list[encoding_index]; encoding_set = 1; free (global_input_encoding_name); global_input_encoding_name = strdup (encoding); } return encoding_set; } static INPUT *input_stack = 0; int input_number = 0; int input_space = 0; int macro_expansion_nr = 0; int value_expansion_nr = 0; /* Current filename and line number. Used for reporting. */ SOURCE_INFO current_source_info; /* Collect text from the input sources until a newline is found. 
This is used instead of next_text when we need to be sure we get an entire line of Texinfo input (for example as a line argument to a command), which might not be the case if the input is the result of a macro expansion. Return value should not be freed by caller, and becomes invalid after a subsequent call. */ /* CURRENT is the current container that can be used for source marks. */ char * new_line (ELEMENT *current) { static TEXT t; char *new = 0; t.end = 0; while (1) { new = next_text (current); if (!new) break; text_append (&t, new); free (new); if (t.text[t.end - 1] == '\n') break; } if (t.end > 0) return t.text; else return 0; } /* Run iconv using text buffer as output buffer. */ size_t text_buffer_iconv (TEXT *buf, iconv_t iconv_state, ICONV_CONST char **inbuf, size_t *inbytesleft) { size_t out_bytes_left; char *outptr; size_t iconv_ret; outptr = buf->text + buf->end; if (buf->end == buf->space - 1) { errno = E2BIG; return (size_t) -1; } out_bytes_left = buf->space - buf->end - 1; iconv_ret = iconv (iconv_state, inbuf, inbytesleft, &outptr, &out_bytes_left); buf->end = outptr - buf->text; return iconv_ret; } static char * encode_with_iconv (iconv_t our_iconv, char *s) { static TEXT t; ICONV_CONST char *inptr; size_t bytes_left; size_t iconv_ret; t.end = 0; /* reset internal TEXT buffer */ inptr = s; bytes_left = strlen (s); text_alloc (&t, 10); while (1) { iconv_ret = text_buffer_iconv (&t, our_iconv, &inptr, &bytes_left); /* Make sure libiconv flushes out the last converted character. This is required when the conversion is stateful, in which case libiconv might not output the last character, waiting to see whether it should be combined with the next one. */ if (iconv_ret != (size_t) -1 && text_buffer_iconv (&t, our_iconv, 0, 0) != (size_t) -1) /* Success: all of input converted. 
*/ break; if (bytes_left == 0) break; switch (errno) { case E2BIG: text_alloc (&t, t.space + 20); break; case EILSEQ: default: fprintf(stderr, "%s:%d: encoding error at byte 0x%2x\n", current_source_info.file_name, current_source_info.line_nr, *(unsigned char *)inptr); inptr++; bytes_left--; break; } } t.text[t.end] = '\0'; return strdup (t.text); } /* Return conversion of S according to input_encoding. This function frees S. */ char * convert_to_utf8 (char *s) { char *ret; /* Convert from @documentencoding to UTF-8. It might be possible not to convert to UTF-8 and use an 8-bit encoding throughout, but then we'd have to not set the UTF-8 flag on the Perl strings in api.c. If multiple character encodings were used in a single file, then we'd have to keep track of which strings needed the UTF-8 flag and which didn't. */ if (current_encoding_conversion == 0) { /* In case the converter couldn't be initialised. Danger: this will cause problems if the input is not in UTF-8 as the Perl strings that are created are flagged as being UTF-8. */ return s; } ret = encode_with_iconv (current_encoding_conversion->iconv, s); free (s); return ret; } int doc_encoding_for_input_file_name = 1; char *input_file_name_encoding = 0; char *locale_encoding = 0; void set_input_file_name_encoding (char *value) { free (input_file_name_encoding); input_file_name_encoding = value ? strdup (value) : 0; } void set_locale_encoding (char *value) { free (locale_encoding); locale_encoding = value ? strdup (value) : 0; } /* Reverse the decoding of the filename to the input encoding, to retrieve the bytes that were present in the original Texinfo file. Return value is freed by free_small_strings. 
*/
/* The reverse conversion is opened on the first call and kept in
   reverse_iconv.  The target encoding is input_file_name_encoding if it
   is set.  Otherwise, if doc_encoding_for_input_file_name is set, it is
   the encoding name of the current conversion, but only when a
   conversion is current and global_input_encoding_name is not "utf-8".
   Otherwise it is locale_encoding if set.  When no conversion was opened
   or iconv_open failed, FILENAME is saved unchanged. */
char *
encode_file_name (char *filename)
{
  if (!reverse_iconv)
    {
      if (input_file_name_encoding)
        {
          reverse_iconv = iconv_open (input_file_name_encoding, "UTF-8");
        }
      else if (doc_encoding_for_input_file_name)
        {
          if (current_encoding_conversion
              && strcmp (global_input_encoding_name, "utf-8"))
            {
              char *conversion_encoding
                = current_encoding_conversion->encoding_name;
              reverse_iconv = iconv_open (conversion_encoding, "UTF-8");
            }
        }
      else if (locale_encoding)
        {
          reverse_iconv = iconv_open (locale_encoding, "UTF-8");
        }
    }
  if (reverse_iconv && reverse_iconv != (iconv_t) -1)
    {
      char *s, *conv;
      conv = encode_with_iconv (reverse_iconv, filename);
      /* keep a copy registered with save_string, and release the
         temporary returned by encode_with_iconv */
      s = save_string (conv);
      free (conv);
      return s;
    }
  else
    {
      return save_string (filename);
    }
}

/* Change the line number of filename of the top input source.  Used to
   record a #line directive.  A LINE_NR of 0 or a null FILENAME leaves
   the corresponding field unchanged.  FILENAME is passed through
   encode_file_name before being stored. */
void
save_line_directive (int line_nr, char *filename)
{
  char *f = 0;
  INPUT *top;

  if (filename)
    f = encode_file_name (filename);

  /* NOTE(review): no check that the input stack is non-empty here;
     presumably callers only call this while an input is being read --
     confirm. */
  top = &input_stack[input_number - 1];
  if (line_nr)
    top->source_info.line_nr = line_nr;
  if (filename)
    top->source_info.file_name = f;
}

/* Return 1 if an entry of the input stack has MACRO as its
   source_info.macro, 0 otherwise. */
int
expanding_macro (char *macro)
{
  int i;
  for (i = 0; i < input_number; i++)
    {
      if (input_stack[i].source_info.macro
          && !strcmp (input_stack[i].source_info.macro, macro))
        {
          return 1;
        }
    }
  return 0;
}

char *save_string (char *string);

/* Record STRING so that the next call to next_text returns it before
   reading any input source.  Only one string can be pending: if one is
   already pending a message is printed on stderr and the pending
   pointer is overwritten. */
void
input_pushback (char *string)
{
  if (input_pushback_string)
    fprintf (stderr,
             "texi2any (XS module): bug: input_pushback called twice\n");
  input_pushback_string = string;
}

/* Return value to be freed by caller.  Return null if we are out of
   input. */
/* CURRENT is the current container that can be used for source marks. */
/* A pending pushed back string is returned first.  Otherwise text is
   read from the top of the input stack; exhausted sources are popped
   until one returns text or the stack is empty.  current_source_info is
   updated each time text is read from an input source. */
char *
next_text (ELEMENT *current)
{
  ssize_t status;
  char *line = 0;
  size_t n = 1;
  /* Note: n needs to be a positive value, rather than 0, to work around
     a bug in getline on MinGW.  This appears to be allowed by POSIX. */
  FILE *input_file;

  if (input_pushback_string)
    {
      char *s;
      s = input_pushback_string;
      input_pushback_string = 0;
      return s;
    }

  while (input_number > 0)
    {
      /* Check for pending input. */
      INPUT *input = &input_stack[input_number - 1];

      switch (input->type)
        {
          char *p, *new;
        case IN_text:
          /*
          debug_nonl ("IN_TEXT '"); debug_print_protected_string (input->ptext);
          debug ("'");
          */
          /* all of the text was already returned: pop this source */
          if (!*input->ptext)
            break;
          /* Split off a line of input. */
          p = strchrnul (input->ptext, '\n');
          new = strndup (input->ptext, p - input->ptext + 1);
          if (*p)
            input->ptext = p + 1;
          else
            input->ptext = p; /* The next time, we will pop the input source. */

          /*
          debug_nonl ("NEW IN_TEXT '"); debug_print_protected_string (new);
          debug_nonl ("' next: '");
          debug_print_protected_string (input->ptext); debug ("'");
          */

          /* lines are not counted within macro or value expansions */
          if (!input->source_info.macro && !input->value_flag)
            input->source_info.line_nr++;

          current_source_info = input->source_info;

          return new;

          break;
        case IN_file:
          input_file = input->file;
          status = getline (&line, &n, input_file);
          if (status != -1)
            {
              char *comment;
              if (feof (input_file))
                {
                  /* Add a newline at the end of the file if one is missing. */
                  char *line2;
                  xasprintf (&line2, "%s\n", line);
                  free (line);
                  line = line2;
                }

              /* Strip off a comment. */
              comment = strchr (line, '\x7F');
              if (comment)
                {
                  SOURCE_MARK *source_mark
                    = new_source_mark (SM_type_delcomment);
                  *comment = '\0';
                  /* the text following the DEL character, if any, is
                     kept in the source mark */
                  if (*(comment+1) != '\0')
                    source_mark->line = convert_to_utf8 (strdup (comment+1));
                  else
                    source_mark->line = 0;
                  /* push an empty text and attach the source mark to
                     it; the mark is registered when that text is
                     popped */
                  input_push_text(strdup (""),
                                  input->source_info.line_nr, 0, 0);
                  /* if the input_stack was reallocated in input_push_text,
                     the input pointer for the file may have been freed and
                     re-created at another address.  Therefore we reset it.
                     input_number has been increased too, so the input file
                     being processed is now at input_number - 2 */
                  input = &input_stack[input_number - 2];
                  set_input_source_mark (source_mark);
                }

              input->source_info.line_nr++;
              current_source_info = input->source_info;

              return convert_to_utf8 (line);
            }
          /* getline failed: release the buffer and pop this source */
          free (line); line = 0;
          break;
        default:
          fatal ("unknown input source type");
        }

      /* Top input source failed.  Close, pop, and try the next one. */
      if (input->type == IN_file)
        {
          FILE *file = input->file;

          if (file != stdin)
            {
              if (fclose (input->file) == EOF)
                {
                  /* convert to UTF-8 for the messages, to have character
                     strings in perl that will be encoded on output to the
                     locale encoding.  Done differently for the file names
                     in source_info which are byte strings and end up
                     unmodified in output error messages.
                   */
                  char *decoded_file_name
                     = convert_to_utf8 (strdup(input->input_file_path));
                  line_warn ("error on closing %s: %s",
                             decoded_file_name,
                             strerror (errno));
                  free (decoded_file_name);
                }
            }
        }
      else
        {
          /* End of text reached. */
          free (input->text);
          if (input->value_flag)
            {
              value_expansion_nr--;
              free (input->value_flag);
            }
          else if (input->macro_name)
            {
              macro_expansion_nr--;
            }
        }

      if (input->input_source_mark)
        {
          if (current)
            {
              SOURCE_MARK *input_source_mark = input->input_source_mark;
              SOURCE_MARK *end_include_source_mark;
              /* a DEL comment mark is registered itself; for the other
                 types a new mark of the same type and counter is
                 registered with the end status */
              if (input_source_mark->type == SM_type_delcomment)
                end_include_source_mark = input_source_mark;
              else
                {
                  end_include_source_mark
                    = new_source_mark (input_source_mark->type);
                  end_include_source_mark->counter
                    = input_source_mark->counter;
                  end_include_source_mark->status = SM_status_end;
                }
              register_source_mark (current, end_include_source_mark);
            }
          else
            debug ("INPUT MARK MISSED");
          input->input_source_mark = 0;
        }
      input_number--;
    }
  debug ("INPUT FINISHED");

  return 0;
}

/* Store TEXT as a source for Texinfo content.  TEXT should be a UTF-8
   string.  TEXT will be later free'd and must be allocated on the heap.
   MACRO_NAME is the name of the macro expanded as text.
It should only be given if this is the text corresponds to a new macro expansion. If already within a macro expansion, but not from a macro expansion (from a value expansion, for instance), the macro name will be taken from the input stack. VALUE_FLAG is the name of the value flag expanded as text. VALUE_FLAG will be later free'd, but not MACRO_NAME. */ void input_push_text (char *text, int line_number, char *macro_name, char *value_flag) { char *filename = 0; char *in_macro = 0; if (!text) return; if (input_number == input_space) { input_space++; input_space *= 1.5; input_stack = realloc (input_stack, input_space * sizeof (INPUT)); if (!input_stack) fatal ("realloc failed"); } input_stack[input_number].type = IN_text; input_stack[input_number].file = 0; input_stack[input_number].input_file_path = 0; input_stack[input_number].text = text; input_stack[input_number].ptext = text; if (input_number > 0) { filename = input_stack[input_number - 1].source_info.file_name; /* context macro expansion */ in_macro = input_stack[input_number - 1].source_info.macro; } if (macro_name) { /* new macro expansion */ in_macro = macro_name; } if (!in_macro && !value_flag) line_number--; input_stack[input_number].source_info.line_nr = line_number; input_stack[input_number].source_info.file_name = save_string (filename); input_stack[input_number].source_info.macro = save_string (in_macro); input_stack[input_number].macro_name = save_string (macro_name); input_stack[input_number].value_flag = value_flag; input_stack[input_number].input_source_mark = 0; input_number++; } void set_input_source_mark (SOURCE_MARK *source_mark) { input_stack[input_number - 1].input_source_mark = source_mark; } /* For filenames and macro names, it is possible that they won't be referenced in the line number of any element. It would be too much work to keep track, so just keep them all here, and free them all together at the end. 
*/ static char **small_strings; static size_t small_strings_num; static size_t small_strings_space; char * save_string (char *string) { char *ret = string ? strdup (string) : 0; if (ret) { if (small_strings_num == small_strings_space) { small_strings_space++; small_strings_space += (small_strings_space >> 2); small_strings = realloc (small_strings, small_strings_space * sizeof (char *)); if (!small_strings) fatal ("realloc failed"); } small_strings[small_strings_num++] = ret; } return ret; } /* Called in reset_parser. */ void free_small_strings (void) { size_t i; for (i = 0; i < small_strings_num; i++) { free (small_strings[i]); } small_strings_num = 0; } void input_reset_input_stack (void) { int i; for (i = 0; i < input_number; i++) { switch (input_stack[i].type) { case IN_file: if (input_stack[i].file != stdin) fclose (input_stack[i].file); break; case IN_text: free (input_stack[i].text); break; } } input_number = 0; macro_expansion_nr = 0; value_expansion_nr = 0; } void reset_encoding_list (void) { int i; /* never reset the utf-8 encoding in position 0 */ if (encoding_number > 1) { for (i = 1; i < encoding_number; i++) { free (encodings_list[i].encoding_name); if (encodings_list[i].iconv != (iconv_t) -1) iconv_close (encodings_list[i].iconv); } encoding_number = 1; } /* could be named global_encoding_conversion and reset in wipe_global_info, but we prefer to keep it static as long as it is only used in one file */ current_encoding_conversion = 0; } int top_file_index (void) { int i = input_number - 1; while (i >= 0 && input_stack[i].type != IN_file) i--; return i; } static char **include_dirs; static size_t include_dirs_number; static size_t include_dirs_space; void add_include_directory (char *filename) { int len; if (include_dirs_number == include_dirs_space) { include_dirs = realloc (include_dirs, sizeof (char *) * (include_dirs_space += 5)); } filename = strdup (filename); include_dirs[include_dirs_number++] = filename; len = strlen (filename); if (len > 0 
&& filename[len - 1] == '/') filename[len - 1] = '\0'; } void clear_include_directories (void) { int i; for (i = 0; i < include_dirs_number; i++) { free (include_dirs[i]); } include_dirs_number = 0; } /* Return value to be freed by caller. */ char * locate_include_file (char *filename) { char *fullpath; struct stat dummy; int i, status; /* Checks if filename is absolute or relative to current directory. */ /* Note: the Perl code (in Common.pm, 'locate_include_file') handles a volume in a path (like "A:") using the File::Spec module. */ if (!memcmp (filename, "/", 1) || !memcmp (filename, "../", 3) || !memcmp (filename, "./", 2)) { status = stat (filename, &dummy); if (status == 0) return strdup (filename); } else { for (i = 0; i < include_dirs_number; i++) { xasprintf (&fullpath, "%s/%s", include_dirs[i], filename); status = stat (fullpath, &dummy); if (status == 0) return fullpath; free (fullpath); } } return 0; } /* Try to open a file called FILENAME, looking for it in the list of include directories. */ int input_push_file (char *filename) { FILE *stream = 0; char *p, *q; char *base_filename; if (!strcmp (filename, "-")) stream = stdin; else { stream = fopen (filename, "r"); if (!stream) return errno; } if (input_number == input_space) { input_stack = realloc (input_stack, (input_space += 5) * sizeof (INPUT)); if (!input_stack) fatal ("realloc failed"); } /* Strip off a leading directory path. 
*/ p = 0; q = strchr (filename, '/'); while (q) { p = q; q = strchr (q + 1, '/'); } if (p) base_filename = save_string (p+1); else base_filename = save_string (filename); input_stack[input_number].type = IN_file; input_stack[input_number].file = stream; input_stack[input_number].input_file_path = filename; input_stack[input_number].source_info.file_name = base_filename; input_stack[input_number].source_info.line_nr = 0; input_stack[input_number].source_info.macro = 0; input_stack[input_number].input_source_mark = 0; input_stack[input_number].text = 0; input_stack[input_number].ptext = 0; input_number++; return 0; }