commit e910caa7621acc383c9ee55ecd04ca5207b20970
parent f78b244670f76569d46dd8292f292cd43555b82c
Author: Chris Bracken <chris@bracken.jp>
Date: Fri, 13 Feb 2026 15:52:00 +0900
Rewrite print_gopher_link_padded
Use wchar API to determine char widths, with a bit of tweaking for
complex emoji sequences.
Diffstat:
3 files changed, 86 insertions(+), 105 deletions(-)
diff --git a/src/format.c b/src/format.c
@@ -4,6 +4,7 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
+#include <wchar.h>
void print_time(FILE* out, time_t time, int timezone_offset) {
// Reject any offset > 24 hours.
@@ -157,118 +158,106 @@ void print_gopher_link(FILE* out, const char* str) {
}
}
-static size_t utf8_char_length(const char* str) {
- unsigned char c = (unsigned char)*str;
- if (c < 0x80) {
- return 1;
- }
- if ((c & 0xe0) == 0xc0) {
- return 2;
- }
- if ((c & 0xf0) == 0xe0) {
- return 3;
- }
- if ((c & 0xf8) == 0xf0) {
- return 4;
- }
- // Invalid UTF-8 sequence. Treat as single byte.
- return 1;
-}
-
-static bool is_zwj(const char* str) {
- return str[0] == '\xe2' && str[1] == '\x80' && str[2] == '\x8d';
-}
-
-static bool is_variation_selector(const char* str) {
- return str[0] == '\xef' && str[1] == '\xb8' && str[2] == '\x8f';
-}
-
-static size_t get_char_width(wchar_t c) {
- // ASCII characters in the range [0-31] and [127-255] are control characters
- // or non-printable characters
- if ((c >= 0 && c <= 31) || (c >= 127 && c <= 255)) {
- return 1;
- }
-
- // Emoji range.
- if (c >= 0x1F300 && c <= 0x1F6FF) {
- return 2;
- }
-
- // Extended pictographic characters.
- if (c >= 0x1F900 && c <= 0x1F9FF) {
- return 2;
- }
-
- // Hiragana and Katakana ranges.
- if ((c >= 0x3040 && c <= 0x309F) || (c >= 0x30A0 && c <= 0x30FF)) {
- return 2;
- }
-
- // Kanji range.
- if (c >= 0x4E00 && c <= 0x9FFF) {
- return 2;
- }
- return 1;
-}
-
-// TODO(cbracken): There has to be a better way.
+// Writes `str` to `out` limited to `width` display columns, then pads with
+// `pad_char` up to `width` (no padding when `pad_char` is '\0').
+//
+// Input is decoded with mbrtowc(), so it is interpreted in the current
+// LC_CTYPE locale; column widths come from wcwidth(), with negative results
+// counted as zero. '|' is written as "\|" (one column), a tab becomes up to
+// eight spaces, and CR/LF are dropped. When exactly one column is left and
+// more input follows, an ellipsis is written and output stops. Bytes that
+// mbrtowc() rejects are passed through one at a time while room remains.
void print_gopher_link_padded(FILE* out,
const char* str,
size_t width,
char pad_char) {
- size_t str_len = strlen(str);
- size_t char_count = 0;
+ if (width == 0)
+ return;
+
+ size_t display_width = 0;
size_t last_char_width = 0;
- size_t byte_count = 0;
- while (byte_count < str_len) {
- size_t pos = byte_count;
- size_t char_length = utf8_char_length(&str[byte_count]);
- byte_count += char_length;
- if (char_count == width - 1 && byte_count < str_len) {
+ mbstate_t state;
+ // mbrtowc() reads the conversion state, so it must start out zeroed
+ // (the initial state) rather than indeterminate.
+ memset(&state, 0, sizeof(state));
+
+ const char* ptr = str;
+ size_t len = strlen(str);
+ const char* end = str + len;
+
+ while (ptr < end) {
+ wchar_t wc;
+ size_t bytes = mbrtowc(&wc, ptr, end - ptr, &state);
+ if (bytes == 0)
+ break;
+
+ if (bytes == (size_t)-1 || bytes == (size_t)-2) {
+ // Invalid (-1) or incomplete (-2) UTF-8. Consume 1 byte.
+ if (display_width == width - 1 && ptr + 1 < end) {
+ fprintf(out, "\u2026");
+ display_width++;
+ break;
+ } else if (display_width < width) {
+ fprintf(out, "%c", *ptr);
+ display_width++;
+ last_char_width = 1;
+ }
+ ptr++;
+ memset(&state, 0, sizeof(state));
+ continue;
+ }
+
+ int w = wcwidth(wc);
+ size_t char_width = (w < 0) ? 0 : (size_t)w;
+
+ // Print ellipsis if one character from max width but more remains.
+ if (display_width == width - 1 && ptr + bytes < end) {
fprintf(out, "\u2026");
+ display_width++;
+ break;
+ }
+
+ // Stop if adding this character exceeds the max width.
+ if (display_width + char_width > width) {
+ break;
+ }
+
+ if (wc == L'|') {
+ fprintf(out, "\\|");
+ display_width++;
last_char_width = 1;
- char_count += last_char_width;
- } else if (char_count < width) {
- if (str[pos] == '|') {
- fprintf(out, "\\|");
- char_count++;
- last_char_width = 1;
- } else if (str[pos] == '\t') {
- for (size_t i = 0; i < 8 && char_count < width; i++) {
- fprintf(out, " ");
- char_count++;
- }
- last_char_width = 1;
- } else if (str[pos] == '\r' || str[pos] == '\n') {
- // Ignore.
- last_char_width = 0;
- } else if (char_length == 3 && is_zwj(&str[pos])) {
- fprintf(out, "\xe2\x80\x8d");
- char_count -= last_char_width;
- last_char_width = 0;
- } else if (char_length == 3 && is_variation_selector(&str[pos])) {
- fprintf(out, "\xef\xb8\x8f");
- char_count -= last_char_width;
- last_char_width = 0;
- } else {
- for (size_t i = pos; i < byte_count; i++) {
- fprintf(out, "%c", str[i]);
- }
- wchar_t wide_char;
- if (mbtowc(&wide_char, &str[pos], MB_CUR_MAX) >= 0) {
- last_char_width = get_char_width(wide_char);
- } else {
- last_char_width = 1;
- }
- char_count += last_char_width;
+ } else if (wc == L'\t') {
+ for (size_t i = 0; i < 8 && display_width < width; i++) {
+ fprintf(out, " ");
+ display_width++;
}
+ last_char_width = 1;
+ } else if (wc == L'\r' || wc == L'\n') {
+ // Ignore.
+ last_char_width = 0;
+ } else {
+ // Hack: handle zero-width joiner and variation selector.
+ // wcwidth lacks awareness of complex emoji modifier sequences.
+ // NOTE(review): the previous character's columns are taken back, so a
+ // base character followed only by a variation selector nets zero columns.
+ if (wc == 0x200D || (wc >= 0xFE00 && wc <= 0xFE0F)) {
+ display_width -= last_char_width;
+ char_width = 0;
+ }
+
+ fprintf(out, "%.*s", (int)bytes, ptr);
+ display_width += char_width;
+ last_char_width = char_width;
}
+
+ ptr += bytes;
}
+
+ // Pad remainder.
if (pad_char != '\0') {
- while (char_count < width) {
+ while (display_width < width) {
fprintf(out, "%c", pad_char);
- char_count++;
+ display_width++;
}
}
}
diff --git a/src/format_tests.c b/src/format_tests.c
@@ -458,10 +458,10 @@ UTEST(print_gopher_link_padded, MultiByte) {
FILE* out = open_memstream(&buf, &size);
ASSERT_NE(NULL, out);
- print_gopher_link_padded(out, "こんにちは", 10, ' ');
+ print_gopher_link_padded(out, "こんにちは", 20, ' ');
fclose(out);
- EXPECT_STREQ("こんにちは ", buf);
+ EXPECT_STREQ("こんにちは ", buf);
free(buf);
}
@@ -512,8 +512,8 @@ UTEST(print_gopher_link_padded, ComplexEmoji) {
print_gopher_link_padded(out, emoji, 10, ' ');
fclose(out);
- /* Expect 1 emoji, 9 spaces. */
- EXPECT_STREQ("👩🏾⚕️ ", buf);
+ /* Expect 1 emoji, 8 spaces. */
+ EXPECT_STREQ("👩🏾⚕️ ", buf);
free(buf);
}
diff --git a/src/gout_tests_main.c b/src/gout_tests_main.c
@@ -1,3 +1,17 @@
+#include <locale.h>
+#include <stdio.h>
+
#include "utest.h"
-UTEST_MAIN()
+UTEST_STATE();
+
+// Test entry point: selects a UTF-8 locale before running the suite, because
+// the multibyte decoding exercised by the tests depends on LC_CTYPE.
+int main(int argc, const char *const argv[]) {
+ // Prefer en_US.UTF-8, fall back to C.UTF-8 where the former is not installed.
+ if (setlocale(LC_ALL, "en_US.UTF-8") == NULL &&
+ setlocale(LC_ALL, "C.UTF-8") == NULL) {
+ fprintf(stderr, "warning: no UTF-8 locale available; multibyte tests may fail\n");
+ }
+ return utest_main(argc, argv);
+}