From: Ralph Corderoy Date: Sun, 6 Aug 2017 17:15:41 +0000 (+0100) Subject: cpstripped(): Rewrite multi-byte version. X-Git-Url: https://diplodocus.org/git/nmh/commitdiff_plain/a091c28b416f83aa46dcb0bd0da81ca1d3b8a7f9?hp=8ba2c2e0a5c0e2c087cd1684a885b959c101e010 cpstripped(): Rewrite multi-byte version. Removes the assert(3) failure reported on the list for 1.7-RC1 when a non-space, non-cntrl, rune has a wcwidth(3) of -1; output L'?' in its place. The old code also didn't reset mbtowc(3)'s state before trying to parse "?"; that's handled differently now so isn't an issue. Pad with the multi-byte encoding of L" ", having ensured the wcwidth(3) of L' ' is one, rather than with a non-wchar_t ' '. Point out padding only occurs in one particular case, and not the other two; quite odd. --- diff --git a/sbr/fmt_scan.c b/sbr/fmt_scan.c index e5e1ff85..8e59669d 100644 --- a/sbr/fmt_scan.c +++ b/sbr/fmt_scan.c @@ -222,75 +222,87 @@ cptrimmed(charstring_t dest, char *str, int wid, char fill, size_t max) { static void cpstripped (charstring_t dest, size_t max, char *str) { - int prevCtrl = 1; /* This is 1 so we strip out leading spaces */ - int len; - int char_len, w; - wchar_t wide_char; - char *altstr = NULL; + static bool deja_vu; + static char *oddchar; + static size_t oddlen; + static char *spacechar; + static size_t spacelen; + char *end; + bool squash; + char *src; + int srclen; + wchar_t rune; + int w; - if (!str) { - return; - } + if (!deja_vu) { + size_t two; - len = strlen(str); + deja_vu = true; - if (mbtowc(NULL, NULL, 0)) {} /* Reset shift state */ + two = MB_CUR_MAX * 2; /* Varies at run-time. */ - /* - * Process each character at a time; if we have multibyte support - * then deal with that here. 
- */ + oddchar = mh_xmalloc(two); + oddlen = wcstombs(oddchar, L"?", two); + assert(oddlen > 0); - while (*str != '\0' && len > 0 && max > 0) { - char_len = mbtowc(&wide_char, str, len); - - /* - * If mbrtowc() failed, then we have a character that isn't valid - * in the current encoding, or len wasn't enough for the whole - * multi-byte rune to be read. Replace it with a '?'. We do that by - * setting the alstr variable to the value of the replacement string; - * altstr is used below when the bytes are copied into the output - * buffer. - */ - if (char_len < 0) { - altstr = "?"; - char_len = mbtowc(&wide_char, altstr, 1); - } + assert(wcwidth(L' ') == 1); /* Need to pad in ones. */ + spacechar = mh_xmalloc(two); + spacelen = wcstombs(spacechar, L" ", two); + assert(spacelen > 0); + } - if (char_len <= 0) { - break; - } + if (!str) + return; /* It's unclear why no padding in this case. */ + end = str + strlen(str); + + if (mbtowc(NULL, NULL, 0)) + {} /* Reset shift state. */ + + squash = true; /* Trim `space' or `cntrl' from the start. */ + while (max) { + if (!*str) + return; /* It's unclear why no padding in this case. */ + + srclen = mbtowc(&rune, str, end - str); + if (srclen == -1) { + /* Invalid rune, or not enough bytes to finish it. */ + rune = L'?'; + src = oddchar; + srclen = oddlen; + str++; /* Skip one byte. */ + } else { + src = str; + str += srclen; + } - len -= char_len; + if (iswspace(rune) || iswcntrl(rune)) { + if (squash) + continue; /* Amidst a run of these. */ + rune = L' '; + src = spacechar; + srclen = spacelen; + squash = true; + } else + squash = false; - if (iswcntrl(wide_char) || iswspace(wide_char)) { - str += char_len; - if (! prevCtrl) { - charstring_push_back (dest, ' '); - --max; - } + w = wcwidth(rune); + if (w == -1) { + rune = L'?'; + w = wcwidth(rune); + assert(w != -1); + src = oddchar; + srclen = oddlen; + } - prevCtrl = 1; - continue; - } + if ((size_t)w > max) { + /* No room for rune; pad. 
*/ + while (max--) + charstring_push_back_chars(dest, spacechar, spacelen, 1); + return; + } - prevCtrl = 0; - - w = wcwidth(wide_char); - assert(w >= 0); - if (max >= (size_t) w) { - charstring_push_back_chars (dest, altstr ? altstr : str, char_len, w); - max -= w; - str += char_len; - altstr = NULL; - } else { - /* Not enough width available for the last character. Output - space(s) to fill. */ - while (max-- > 0) { - charstring_push_back (dest, ' '); - } - break; - } + charstring_push_back_chars(dest, src, srclen, w); + max -= w; } } #endif