/* Copyright 2010-2023 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

/* NOTE(review): the bare '#include' directives below have no header name;
   the angle-bracket operands appear to have been stripped from this copy
   of the file.  Restore them from the upstream source before building.  */
#ifdef HAVE_CONFIG_H
#include
#endif

#include
#include
#include
#include
#ifndef _WIN32
#include
#else /* _WIN32 */
/* Workaround for problems caused in mingw.org's MinGW build by
   Gnulib's wchar.h overriding the wint_t type definition, which
   causes compilation errors when perl.h is included below, because
   perl.h includes ctype.h.  */
#include
#endif
#include
#include

/* See "How do I use all this in extensions" in 'man perlguts'. */
#define PERL_NO_GET_CONTEXT

#include "EXTERN.h"
#include "perl.h"
#if defined _WIN32 && !defined __CYGWIN__
# undef free
#endif
#include "XSUB.h"

#include "ppport.h"

#include "xspara.h"

#include "text.h"

/* When non-zero, the functions in this file print trace messages to
   stderr.  */
int debug = 0;

/* Complete formatting state of one paragraph.  The paragraph currently
   being worked on lives in the file-scope variable 'state'.  */
typedef struct {
    TEXT space; /* Pending space, to be output before the pending word. */
    TEXT word; /* Pending word.  If outputting this would have led to
                  the line to be too long, the line should have been cut
                  before saving it.  */

    /* When word.end == 0, this indicates a word of length 0. */
    int invisible_pending_word;

    /* Length of space in multibyte characters. */
    int space_counter;

    /* Characters added so far in current word. */
    int word_counter;

    /* -2 means we are not at the end of a sentence (undefined in Perl),
       1 means we are at the end of a sentence and French spacing is off,
       -1 means we are at the end of a sentence and French spacing is on.
       0 means it is "inhibited". */
    int end_sentence;

    int max; /* Maximum length of line. */
    int indent_length; /* Columns to indent this line. */
    int indent_length_next; /* Columns to indent the rest of the lines. */
    int counter; /* Columns so far on this line. */

    int lines_counter; /* Lines so far added in paragraph. */
    int end_line_count; /* Number of newlines so far in an output unit, i.e.
                           with add_text or add_next. */

    wint_t last_letter; /* Last letter in word, used to decide if we're
                           at the end of a sentence. */

    /* Options set with set_space_protection. */
    int no_break; /* Line break forbidden, as in @w. */
    int ignore_columns; /* Don't cut line at right margin.  Used by
                           @flushleft and @flushright. */
    int keep_end_lines; /* A newline in the input ends a line in the output.
                           Used by @flushleft and @flushright. */
    int french_spacing; /* Only one space, not two, after a full stop. */
    int double_width_no_break; /* No line break between double width
                                  chars. */

    /* No wrapping of lines and spaces are kept as-is. */
    int unfilled;

    /* Do not terminate with a final newline. */
    int no_final_newline;

    /* Terminate with any trailing space. */
    int add_final_space;

    /* Non-zero while this slot holds a live paragraph; set when the slot
       is handed out and cleared when the paragraph is ended.  */
    int in_use;
} PARAGRAPH;

/* The paragraph state all functions in this file operate on. */
static PARAGRAPH state;

#ifdef _WIN32

#define WIN32_LEAN_AND_MEAN
#include
#include

/* If Gnulib overrides wint_t with a wider type, we cannot use
   iswspace etc. names, whose prototypes were seen with the original
   wint_t in effect.  */
#ifdef GNULIB_defined_wint_t
# undef iswspace
# define iswspace(w) w32_iswspace(w)
# undef iswupper
# define iswupper(w) w32_iswupper(w)
#endif

/* Replacement for setlocale on Windows.  Only the locale name
   "en_us.utf-8" (compared case-insensitively) is accepted; any other
   VALUE yields NULL.  */
char *
w32_setlocale (int category, const char *value)
{
  if (_stricmp (value, "en_us.utf-8") != 0)
    return NULL;

  /* Switch to the Windows U.S. English locale with its default
     codeset.  We will handle the non-ASCII text ourselves, so the
     codeset is unimportant, and Windows doesn't support UTF-8 as the
     codeset anyway.  */
  return setlocale (category, "ENU");
}
#define setlocale(c,v) w32_setlocale(c,v)

/* Return the byte length (1 to 4) of the UTF-8 sequence starting at MBS,
   judged from its first byte only.  N is not consulted.  A non-NULL PS
   is not supported: errno is set to ENOSYS and -1 is returned.
   NOTE(review): *MBS is read before anything checks MBS for NULL.  */
size_t
mbrlen (const char * __restrict__ mbs, size_t n, mbstate_t * __restrict__ ps)
{
  unsigned char byte1 = *mbs;

  if (ps != NULL)
    {
      errno = ENOSYS;
      return -1;
    }

  return
    ((byte1 & 0x80) == 0)
    ? 1
    : ((byte1 & 0x20) == 0)
    ? 2
    : ((byte1 & 0x10) == 0)
    ? 3
    : 4;
}

/* Convert a UTF-8 encoded multibyte string to a wide character.
   Returns the byte length reported by mbrlen on success, 0 when MBS is
   NULL, and (size_t) -1 with errno set on an invalid sequence (EILSEQ)
   or a non-NULL PS (ENOSYS).
   NOTE(review): mbrlen is called, and so dereferences MBS, before the
   MBS == NULL test below is reached.  */
size_t
mbrtowc (wchar_t * __restrict__ pwc, const char * __restrict__ mbs, size_t n,
         mbstate_t * __restrict__ ps)
{
  int len = mbrlen (mbs, n, ps);

  if (mbs == NULL)
    return 0;
  else
    {
      wchar_t wc[2];
      size_t n_utf16 = MultiByteToWideChar (CP_UTF8, MB_ERR_INVALID_CHARS,
                                            mbs, len, wc, 2);
      if (n_utf16 == 0)
        {
          errno = EILSEQ;
          return (size_t)-1;
        }
      if (ps != NULL)
        {
          errno = ENOSYS;
          return (size_t)-1;
        }
      /* We don't support UTF-16 surrogates, because the calling code
         doesn't, and because character classification functions on
         Windows don't support anything beyond the BMP anyway.  So we
         return the first character of the surrogate pair and set
         errno.  */
      if (n_utf16 > 1)
        errno = ENOSYS;
      if (pwc != NULL)
        *pwc = wc[0];

      return len;
    }
}

/* NOTE - not used at present */
/* Return 1 if WC is one of the code points listed below, else 0. */
int
iswspace (wint_t wc)
{
  /* See Unicode's Proplist.txt.  */
  if ((wc >= 0x09 && wc <= 0x0D)
      || wc == 0x20
      || wc == 0x85
      || wc == 0xA0
      || wc == 0x1680
      || (wc >= 0x2000 && wc <= 0x200A)
      || wc == 0x2028
      || wc == 0x2029
      || wc == 0x202F
      || wc == 0x205F
      || wc == 0x3000)
    return 1;

  return 0;
}

/* Return 1 if GetStringTypeW classifies WI as upper case (C1_UPPER),
   0 otherwise or when the classification call fails.  */
int
iswupper (wint_t wi)
{
  WORD char_type;
  wchar_t wc = wi;
  BOOL status = GetStringTypeW (CT_CTYPE1, &wc, 1, &char_type);

  if (!status || (char_type & C1_UPPER) == 0)
    return 0;

  return 1;
}

/* Avoid warnings due to redefinition of popen/pclose in Perl headers.
*/ #ifdef popen # undef popen # define popen(c,m) _popen(c,m) #endif #ifdef pclose # undef pclose # define pclose(f) _pclose(f) #endif #endif /* for debug */ char * xspara__print_escaped_spaces (char *string) { static TEXT t; char *p = string; text_reset (&t); while (*p) { if (*p == ' ') text_append_n (&t, p, 1); else if (*p == '\n') text_append_n (&t, "\\n", 2); else if (*p == '\f') text_append_n (&t, "\\f", 2); else if (isspace(*p)) { char protected_string[7]; sprintf (protected_string, "\\x%04x", *p); text_append (&t, protected_string); } p++; } return t.text; } int xspara_init (int unused, char *unused2) { char *utf8_locale = 0; int len; char *cur; char *dot; dTHX; #if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8) /* needed due to thread-safe locale handling in newer perls */ switch_to_global_locale(); #endif if (setlocale (LC_CTYPE, "en_US.UTF-8") || setlocale (LC_CTYPE, "en_US.utf8")) goto success; cur = setlocale (LC_CTYPE, 0); /* Name of current locale. */ if (!cur) goto failure; len = strlen (cur); if ((len >= 6 && !memcmp (".UTF-8", cur + len - 6, 6)) || (len >= 5 && !memcmp (".utf8", cur + len - 5, 5)) || (len >= 6 && !memcmp (".utf-8", cur + len - 6, 6)) || (len >= 5 && !memcmp (".UTF8", cur + len - 5, 5))) { setlocale (LC_CTYPE, ""); /* Use the locale from the environment. */ goto success; } /* Otherwise try altering the current locale name. */ dot = strchr (cur, '.'); if (!dot) dot = cur + len; utf8_locale = malloc (len + 6 + 1); /* enough to add ".UTF-8" to end */ memcpy (utf8_locale, cur, dot - cur); dot = utf8_locale + (dot - cur); memcpy (dot, ".UTF-8", 7); if (setlocale (LC_CTYPE, utf8_locale)) goto success; memcpy (dot, ".utf8", 6); if (setlocale (LC_CTYPE, utf8_locale)) goto success; /* Otherwise, look for any UTF-8 locale in the output of "locale -a". 
*/ { FILE *p; char *line = 0; size_t n = 0; ssize_t ret; p = popen ("locale -a", "r"); if (!p) goto failure; while (1) { ret = getline (&line, &n, p); if (ret == (ssize_t) -1) { free (line); pclose (p); goto failure; } if (strstr (line, "UTF-8") || strstr (line, "utf8")) { line[ret - 1] = '\0'; /* Remove trailing newline. */ if (setlocale (LC_CTYPE, line)) { free (line); pclose (p); goto success; } } } } if (1) { failure: return 0; /* failure */ } else { success: ; free (utf8_locale); #if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8) /* needed due to thread-safe locale handling in newer perls */ sync_locale(); #endif /* fprintf (stderr, "tried to set LC_CTYPE to UTF-8.\n"); fprintf (stderr, "character encoding is: %s\n", nl_langinfo (CODESET)); */ return 1; /* success */ } } /* Array for storing paragraph states which aren't in use. */ static PARAGRAPH *state_array; static int state_array_size; /* The slot in state_array for saving the current state. */ static int current_state; static void xspara__switch_state (int id) { if (current_state == id) return; if (current_state != -1) memcpy (&state_array[current_state], &state, sizeof (PARAGRAPH)); memcpy (&state, &state_array[id], sizeof (PARAGRAPH)); current_state = id; } int xspara_new (HV *conf) { int i; dTHX; /* Perl boiler plate */ TEXT saved_space, saved_word; /* Find an unused slot in state_array */ for (i = 0; i < state_array_size; i++) { if (!state_array[i].in_use) break; } if (i == state_array_size) { state_array = realloc (state_array, (state_array_size += 10) * sizeof (PARAGRAPH)); memset (state_array + i, 0, 10 * sizeof (PARAGRAPH)); } state_array[i].in_use = 1; xspara__switch_state (i); /* Zero formatter, reusing storage. */ saved_space = state.space; saved_word = state.word; memset (&state, 0, sizeof (state)); state.space = saved_space; state.word = saved_word; state.space.end = state.word.end = 0; state.in_use = 1; /* Default values. 
*/ state.max = 72; state.indent_length_next = -1; /* Special value meaning undefined. */ state.end_sentence = -2; /* Special value meaning undefined. */ state.last_letter = L'\0'; if (conf) xspara_init_state (conf); /* The paragraph ID. */ return i; } /* SV is a blessed reference to an integer containing the paragraph ID. */ void xspara_set_state (SV *sv) { dTHX; xspara__switch_state (SvIV (sv)); } /* Set the state internal to this C module from the Perl hash. */ void xspara_init_state (HV *hash) { #define FETCH(key) hv_fetch (hash, key, strlen (key), 0) #define FETCH_INT(key,where) { val = FETCH(key); \ if (val) { where = SvIV (*val); } } SV **val; dTHX; /* This is boilerplate for interacting with Perl. */ /* Fetch all these so they are set, and reset for each paragraph. */ FETCH_INT("end_sentence", state.end_sentence); FETCH_INT("max", state.max); FETCH_INT("indent_length", state.indent_length); FETCH_INT("indent_length_next", state.indent_length_next); FETCH_INT("counter", state.counter); FETCH_INT("word_counter", state.word_counter); FETCH_INT("lines_counter", state.lines_counter); FETCH_INT("end_line_count", state.end_line_count); FETCH_INT("no_break", state.no_break); FETCH_INT("ignore_columns", state.ignore_columns); FETCH_INT("keep_end_lines", state.keep_end_lines); FETCH_INT("frenchspacing", state.french_spacing); FETCH_INT("unfilled", state.unfilled); FETCH_INT("no_final_newline", state.no_final_newline); FETCH_INT("add_final_space", state.add_final_space); val = FETCH("word"); if (val) { fprintf (stderr, "Bug: setting 'word' is not supported.\n"); abort (); } val = FETCH("space"); if (val) { fprintf (stderr, "Bug: setting 'space' is not supported.\n"); abort (); } return; #undef FETCH #undef FETCH_INT } /************************************************************************/ /* Append a newline character to RESULT. 
*/ void xspara__cut_line (TEXT *result) { if (!state.ignore_columns) { xspara__end_line (); text_append (result, "\n"); } } int xspara_end_line_count (void) { return state.end_line_count; } int xspara_counter (void) { return state.counter; } /* End a line (throwing away a pending space, which we don't need) Note _end_line in Paragraph.pm returned "\n". */ void xspara__end_line (void) { state.counter = 0; state.space.end = 0; state.space_counter = 0; /* This will only be true for the first line of output. */ if (state.indent_length_next != -1) { state.indent_length = state.indent_length_next; state.indent_length_next = -1; } state.lines_counter++; state.end_line_count++; /* could be set to other values, anything that is not upper case. */ state.last_letter = L'\n'; } char * xspara_end_line (void) { state.end_line_count = 0; xspara__end_line (); return "\n"; } /* Return concatenation of SPACE and WORD. */ char * xspara_get_pending (void) { static TEXT t; text_reset (&t); text_append_n (&t, state.space.text, state.space.end); text_append_n (&t, state.word.text, state.word.end); return t.text; } /* Append to RESULT pending space followed by pending word, clearing them afterwards. Assume we don't need to wrap a line. Only add spaces without a word if ADD_SPACES. */ void xspara__add_pending_word (TEXT *result, int add_spaces) { dTHX; if (state.word.end == 0 && !state.invisible_pending_word && !add_spaces) return; if (state.indent_length > state.counter) { int i; /* If we are not up to the left margin yet, output spaces to get there, and ignore 'state.space', the pending space string. In this case state.counter is probably 0. */ for (i = 0; i < state.indent_length - state.counter; i++) text_append (result, " "); state.counter = state.indent_length; if (debug) fprintf (stderr, "INDENT(%d+%d)\n", state.counter, state.word_counter); /* Do not output leading spaces after the indent, unless 'unfilled' is on. 
*/ if (!state.unfilled) { state.space.end = 0; state.space_counter = 0; } } if (state.space.end > 0) { text_append_n (result, state.space.text, state.space.end); state.counter += state.space_counter; if (debug) fprintf (stderr, "ADD_SPACES(%d+%d)\n", state.counter, state.word_counter); state.space.end = 0; state.space_counter = 0; } if (state.word.end > 0 || state.invisible_pending_word) { text_append_n (result, state.word.text, state.word.end); state.counter += state.word_counter; if (debug) fprintf (stderr, "ADD_WORD[%s]+%d (%d)\n", state.word.text, state.word_counter, state.counter); state.word.end = 0; state.word_counter = 0; state.invisible_pending_word = 0; } } /* Function for users of this module. */ char * xspara_add_pending_word (int add_spaces) { static TEXT ret; text_reset (&ret); state.end_line_count = 0; xspara__add_pending_word (&ret, add_spaces); if (ret.text) return ret.text; else return ""; } /* End a paragraph. */ char * xspara_end (void) { static TEXT ret; dTHX; text_reset (&ret); state.end_line_count = 0; if (debug) fprintf (stderr, "PARA END\n"); /* probably not really useful, but cleaner */ state.last_letter = L'\0'; xspara__add_pending_word (&ret, state.add_final_space); if (!state.no_final_newline && state.counter != 0) { text_append (&ret, "\n"); state.lines_counter++; state.end_line_count++; } /* Now it's time to forget about the state. */ state_array[current_state].in_use = 0; state.in_use = 0; /* Don't do this so we can get the closing line counts. */ /* current_state = -1; */ if (ret.text) return ret.text; else return ""; } /* check if a byte is in the printable ASCII range */ #define PRINTABLE_ASCII(c) (0x20 <= (c) && (c) <= 0x7E) /* ignored after end sentence character to determine if at the end of a sentence */ #define after_punctuation_characters "\"')]" /* characters triggering an end of sentence */ #define end_sentence_characters ".?!" /* Add WORD to paragraph in RESULT, not refilling WORD. 
If we go past the end of the line start a new one. TRANSPARENT means that the letters in WORD are ignored for the purpose of deciding whether a full stop ends a sentence or not. */ void xspara__add_next (TEXT *result, char *word, int word_len, int transparent) { dTHX; int disinhibit = 0; if (!word) return; if (word_len >= 1 && word[word_len - 1] == '\b') { word[--word_len] = '\0'; disinhibit = 1; } text_append_n (&state.word, word, word_len); if (word_len == 0 && word) state.invisible_pending_word = 1; if (!transparent) { if (disinhibit) state.last_letter = L'a'; /* a lower-case letter */ else { /* Save last character in WORD */ char *p = word + word_len; while (p > word) { int len = 0; /* Back one UTF-8 code point */ do { p--; len++; } while ((*p & 0xC0) == 0x80 && p > word); if (!strchr (end_sentence_characters after_punctuation_characters, *p)) { if (!PRINTABLE_ASCII(*p)) { wchar_t wc = L'\0'; mbrtowc (&wc, p, len, NULL); state.last_letter = wc; break; } else { state.last_letter = btowc (*p); break; } } } } } if (strchr (word, '\n')) { /* If there was a newline in the word we just added, put the entire pending ouput in the results string, and start a new line. */ xspara__add_pending_word (result, 0); xspara__end_line (); } else { /* Calculate length of multibyte string in characters. */ int len = 0; int left = word_len; wchar_t w; char *p = word; while (left > 0) { int columns; int char_len; if (PRINTABLE_ASCII(*p)) { len++; p++; left--; continue; } char_len = mbrtowc (&w, p, left, NULL); if (char_len == (size_t) -2) { /* unfinished multibyte character */ char_len = left; } else if (char_len == (size_t) -1) { /* invalid character */ char_len = 1; } else if (char_len == 0) { /* not sure what this means but we must avoid an infinite loop. 
Possibly only happens with invalid strings */ char_len = 1; } left -= char_len; columns = wcwidth (w); if (columns > 0) len += columns; p += char_len; } state.word_counter += len; if (state.counter != 0 && state.counter + state.word_counter + state.space_counter > state.max) { xspara__cut_line (result); } } if (debug) fprintf (stderr, "WORD+ %s -> %s\n", word, state.word.space == 0 ? "UNDEF" : state.word.text); } /* Like _add_next but zero end_line_count at beginning. */ TEXT xspara_add_next (char *text, int text_len, int transparent) { static TEXT t; text_reset (&t); state.end_line_count = 0; xspara__add_next (&t, text, text_len, transparent); return t; } void xspara_remove_end_sentence (void) { state.end_sentence = 0; } void xspara_add_end_sentence (int value) { state.end_sentence = value; } void xspara_allow_end_sentence (void) { state.last_letter = L'a'; /* A lower-case letter. */ } /* -1 in a parameter means leave that value as it is. */ void xspara_set_space_protection (int no_break, int ignore_columns, int keep_end_lines, int french_spacing, int double_width_no_break) { if (no_break != -1) state.no_break = no_break; if (ignore_columns != -1) state.ignore_columns = ignore_columns; if (keep_end_lines != -1) state.keep_end_lines = keep_end_lines; if (double_width_no_break != -1) state.double_width_no_break = double_width_no_break; if (french_spacing != -1) state.french_spacing = french_spacing; /*fprintf (stderr, "SETTING SPACE (%d, %d, %d, %d)\n", no_break, ignore_columns, keep_end_lines, french_spacing);*/ if (no_break != -1 && state.no_break) { if (state.word.end == 0) { /* In _add_pending_word this meant that an "empty word" would be output. This makes "a @w{} b" -> "a b", not "a b", and "a @w{}" at end of paragraph -> "a ", not "a". */ state.invisible_pending_word = 1; } } return; } /*****************************************************************/ /* Return string to be added to paragraph contents, wrapping text. 
This function relies on there being a UTF-8 locale in LC_CTYPE for mbrtowc to work correctly. */ TEXT xspara_add_text (char *text, int len) { char *p = text; wchar_t wc; size_t char_len; int width; static TEXT result; dTHX; text_reset (&result); state.end_line_count = 0; while (len > 0) { if (debug) { fprintf(stderr, "p (%d+%d) s `%s', l `%lc', w `%s'\n", state.counter, state.word_counter, state.space.end == 0 ? "" : xspara__print_escaped_spaces (state.space.text), state.last_letter, state.word.end > 0 ? state.word.text : "UNDEF"); } if (isspace ((unsigned char) *p)) { if (debug) { char t[2]; t[0] = *p; t[1] = '\0'; fprintf(stderr, "SPACES(%d) `%s'\n", state.counter, xspara__print_escaped_spaces (t)); } if (state.unfilled) { xspara__add_pending_word (&result, 0); if (*p == '\n') { xspara__end_line (); text_append (&result, "\n"); } else { text_append_n (&state.space, p, 1); state.space_counter++; } } else if (state.no_break) { /* Append the spaces to the pending word. */ if (state.word.end == 0 || state.word.text[state.word.end - 1] != ' ') { if (state.end_sentence == 1 && !state.french_spacing) { text_append_n (&state.word, " ", 2); state.word_counter += 2; } else { text_append_n (&state.word, " ", 1); state.word_counter += 1; } if (state.counter != 0 && state.counter + state.word_counter + state.space_counter > state.max) { xspara__cut_line (&result); } } } else /* no_break off */ { int pending = state.invisible_pending_word; xspara__add_pending_word (&result, 0); if (state.counter != 0 || pending) { /* If we are at the end of a sentence where two spaces are required. */ if (state.end_sentence == 1 && !state.french_spacing) { state.space.end = 0; text_append_n (&state.space, " ", 2); state.space_counter = 2; } else /* Not at end of sentence. */ { /* Only save the first space. 
*/ if (state.space_counter < 1) { if (*p == '\n') { text_append_n (&state.space, " ", 1); state.space_counter++; } else { text_append_n (&state.space, p, 1); state.space_counter++; } } } } } /* If not enough space in the line for the pending space, start a new line. */ if (state.counter + state.space_counter > state.max) { xspara__cut_line (&result); } if (!state.unfilled && *p == '\n' && state.keep_end_lines) { xspara__end_line (); text_append (&result, "\n"); } p++; len--; state.last_letter = ' '; continue; } /************** Not a white space character. *****************/ if (!PRINTABLE_ASCII(*p)) { char_len = mbrtowc (&wc, p, len, NULL); } else { /* Functonally the same as mbrtowc but (tested) slightly quicker. */ char_len = 1; wc = btowc (*p); } if ((long) char_len == 0) break; /* Null character. Shouldn't happen. */ else if ((long) char_len < 0) { p++; len--; /* Invalid. Just try to keep going. */ continue; } width = wcwidth (wc); /*************** Double width character. *********************/ if (width == 2) { if (debug) fprintf (stderr, "FULLWIDTH\n"); text_append_n (&state.word, p, char_len); state.word_counter += 2; /* fullwidth latin letters can be upper case, so it is important to use the actual characters here. */ state.last_letter = wc; /* We allow a line break in between Chinese characters even if there was no space between them, unlike single-width characters. */ if (state.counter != 0 && state.counter + state.word_counter > state.max) { xspara__cut_line (&result); } /* Accumulate the characters so that they can be pushed onto the next line if necessary. */ if (!state.no_break && !state.double_width_no_break) { xspara__add_pending_word (&result, 0); } state.end_sentence = -2; } else if (wc == L'\b') { /* Code to say that a following full stop (or question or exclamation mark) may be an end of sentence. 
*/ xspara_allow_end_sentence (); } /*************** Word character ******************************/ /* Note: width == 0 includes accent characters which should not properly increase the column count. This is not what the pure Perl code does, though. */ else if (width == 1 || width == 0) { static char added_word[8]; /* long enough for one UTF-8 character */ memcpy (added_word, p, char_len); added_word[char_len] = '\0'; xspara__add_next (&result, added_word, char_len, 0); /* Now check if it is considered as an end of sentence, and set state.end_sentence if it is. */ if (strchr (end_sentence_characters, *p) && !state.unfilled) { /* Doesn't count if preceded by an upper-case letter. */ if (!iswupper (state.last_letter)) { if (state.french_spacing) state.end_sentence = -1; else state.end_sentence = 1; if (debug) fprintf (stderr, "END_SENTENCE\n"); } } else if (strchr (after_punctuation_characters, *p)) { /* '"', '\'', ']' and ')' are ignored for the purpose of deciding whether a full stop ends a sentence. */ } else { /* Otherwise reset the end of sentence marker: a full stop in a string like "aaaa.bbbb" doesn't mark an end of sentence. */ state.last_letter = wc; if (debug && state.end_sentence != -2) fprintf (stderr, "delete END_SENTENCE(%d)\n", state.end_sentence); state.end_sentence = -2; } } else { /* Not printable, possibly a tab, or a combining character. Add it to the pending word without increasing the column count. */ text_append_n (&state.word, p, char_len); } p += char_len; len -= char_len; } return result; }