gout

A static git page generator
git clone https://git.bracken.jp/gout.git
Log | Files | Refs | README | LICENSE

commit e910caa7621acc383c9ee55ecd04ca5207b20970
parent f78b244670f76569d46dd8292f292cd43555b82c
Author: Chris Bracken <chris@bracken.jp>
Date:   Fri, 13 Feb 2026 15:52:00 +0900

Rewrite print_gopher_link_padded

Use wchar API to determine char widths, with a bit of tweaking for
complex emoji sequences.

Diffstat:
M src/format.c | 175 ++++++++++++++++++++++++++++++++++---------------------------------------------
M src/format_tests.c | 8 ++++----
M src/gout_tests_main.c | 8 +++++++-
3 files changed, 86 insertions(+), 105 deletions(-)

diff --git a/src/format.c b/src/format.c @@ -4,6 +4,7 @@ #include <stdlib.h> #include <string.h> #include <time.h> +#include <wchar.h> void print_time(FILE* out, time_t time, int timezone_offset) { // Reject any offset > 24 hours. @@ -157,118 +158,92 @@ void print_gopher_link(FILE* out, const char* str) { } } -static size_t utf8_char_length(const char* str) { - unsigned char c = (unsigned char)*str; - if (c < 0x80) { - return 1; - } - if ((c & 0xe0) == 0xc0) { - return 2; - } - if ((c & 0xf0) == 0xe0) { - return 3; - } - if ((c & 0xf8) == 0xf0) { - return 4; - } - // Invalid UTF-8 sequence. Treat as single byte. - return 1; -} - -static bool is_zwj(const char* str) { - return str[0] == '\xe2' && str[1] == '\x80' && str[2] == '\x8d'; -} - -static bool is_variation_selector(const char* str) { - return str[0] == '\xef' && str[1] == '\xb8' && str[2] == '\x8f'; -} - -static size_t get_char_width(wchar_t c) { - // ASCII characters in the range [0-31] and [127-255] are control characters - // or non-printable characters - if ((c >= 0 && c <= 31) || (c >= 127 && c <= 255)) { - return 1; - } - - // Emoji range. - if (c >= 0x1F300 && c <= 0x1F6FF) { - return 2; - } - - // Extended pictographic characters. - if (c >= 0x1F900 && c <= 0x1F9FF) { - return 2; - } - - // Hiragana and Katakana ranges. - if ((c >= 0x3040 && c <= 0x309F) || (c >= 0x30A0 && c <= 0x30FF)) { - return 2; - } - - // Kanji range. - if (c >= 0x4E00 && c <= 0x9FFF) { - return 2; - } - return 1; -} - -// TODO(cbracken): There has to be a better way. 
void print_gopher_link_padded(FILE* out, const char* str, size_t width, char pad_char) { - size_t str_len = strlen(str); - size_t char_count = 0; + if (width == 0) + return; + + size_t display_width = 0; size_t last_char_width = 0; - size_t byte_count = 0; - while (byte_count < str_len) { - size_t pos = byte_count; - size_t char_length = utf8_char_length(&str[byte_count]); - byte_count += char_length; - if (char_count == width - 1 && byte_count < str_len) { + mbstate_t state; + + const char* ptr = str; + size_t len = strlen(str); + const char* end = str + len; + + while (ptr < end) { + wchar_t wc; + size_t bytes = mbrtowc(&wc, ptr, end - ptr, &state); + if (bytes == 0) + break; + + if (bytes == (size_t)-1 || bytes == (size_t)-2) { + // Invalid (-1) or incomplete (-2) UTF-8. Consume 1 byte. + if (display_width == width - 1 && ptr + 1 < end) { + fprintf(out, "\u2026"); + display_width++; + break; + } else if (display_width < width) { + fprintf(out, "%c", *ptr); + display_width++; + last_char_width = 1; + } + ptr++; + memset(&state, 0, sizeof(state)); + continue; + } + + int w = wcwidth(wc); + size_t char_width = (w < 0) ? 0 : w; + + // Print ellipsis if one character from max width but more remains. + if (display_width == width - 1 && ptr + bytes < end) { fprintf(out, "\u2026"); + display_width++; + break; + } + + // Stop if adding this character exceeds the max width. + if (display_width + char_width > width) { + break; + } + + if (wc == L'|') { + fprintf(out, "\\|"); + display_width++; last_char_width = 1; - char_count += last_char_width; - } else if (char_count < width) { - if (str[pos] == '|') { - fprintf(out, "\\|"); - char_count++; - last_char_width = 1; - } else if (str[pos] == '\t') { - for (size_t i = 0; i < 8 && char_count < width; i++) { - fprintf(out, " "); - char_count++; - } - last_char_width = 1; - } else if (str[pos] == '\r' || str[pos] == '\n') { - // Ignore. 
- last_char_width = 0; - } else if (char_length == 3 && is_zwj(&str[pos])) { - fprintf(out, "\xe2\x80\x8d"); - char_count -= last_char_width; - last_char_width = 0; - } else if (char_length == 3 && is_variation_selector(&str[pos])) { - fprintf(out, "\xef\xb8\x8f"); - char_count -= last_char_width; - last_char_width = 0; - } else { - for (size_t i = pos; i < byte_count; i++) { - fprintf(out, "%c", str[i]); - } - wchar_t wide_char; - if (mbtowc(&wide_char, &str[pos], MB_CUR_MAX) >= 0) { - last_char_width = get_char_width(wide_char); - } else { - last_char_width = 1; - } - char_count += last_char_width; + } else if (wc == L'\t') { + for (size_t i = 0; i < 8 && display_width < width; i++) { + fprintf(out, " "); + display_width++; } + last_char_width = 1; + } else if (wc == L'\r' || wc == L'\n') { + // Ignore. + last_char_width = 0; + } else { + // Hack: handle zero-width joiner and variation selector. + // wcwidth lacks awareness of complex emoji modifier sequences. + if (wc == 0x200D || (wc >= 0xFE00 && wc <= 0xFE0F)) { + display_width -= last_char_width; + char_width = 0; + } + + fprintf(out, "%.*s", (int)bytes, ptr); + display_width += char_width; + last_char_width = char_width; } + + ptr += bytes; } + + // Pad remainder. if (pad_char != '\0') { - while (char_count < width) { + while (display_width < width) { fprintf(out, "%c", pad_char); - char_count++; + display_width++; } } } diff --git a/src/format_tests.c b/src/format_tests.c @@ -458,10 +458,10 @@ UTEST(print_gopher_link_padded, MultiByte) { FILE* out = open_memstream(&buf, &size); ASSERT_NE(NULL, out); - print_gopher_link_padded(out, "こんにちは", 10, ' '); + print_gopher_link_padded(out, "こんにちは", 20, ' '); fclose(out); - EXPECT_STREQ("こんにちは ", buf); + EXPECT_STREQ("こんにちは ", buf); free(buf); } @@ -512,8 +512,8 @@ UTEST(print_gopher_link_padded, ComplexEmoji) { print_gopher_link_padded(out, emoji, 10, ' '); fclose(out); - /* Expect 1 emoji, 9 spaces. */ - EXPECT_STREQ("👩🏾‍⚕️ ", buf); + /* Expect 1 emoji, 8 spaces. 
*/ + EXPECT_STREQ("👩🏾‍⚕️ ", buf); free(buf); } diff --git a/src/gout_tests_main.c b/src/gout_tests_main.c @@ -1,3 +1,9 @@ +#include "locale.h" #include "utest.h" -UTEST_MAIN() +UTEST_STATE(); + +int main(int argc, const char *const argv[]) { + setlocale(LC_ALL, "en_US.UTF-8"); + return utest_main(argc, argv); +}