Rewrite print_gopher_link_padded - gout - A static git page generator

commit e910caa7621acc383c9ee55ecd04ca5207b20970
parent f78b244670f76569d46dd8292f292cd43555b82c
Author: Chris Bracken <chris@bracken.jp>
Date:   Fri, 13 Feb 2026 15:52:00 +0900

Rewrite print_gopher_link_padded

Use wchar API to determine char widths, with a bit of tweaking for
complex emoji sequences.

Diffstat:
M src/format.c  | 175 +++++++++++++++++++++++++++++++++--------------------------------------------
M src/format_tests.c  | 8 ++++----
M src/gout_tests_main.c  | 8 +++++++-

3 files changed, 86 insertions(+), 105 deletions(-)
diff --git a/src/format.c b/src/format.c
@@ -4,6 +4,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
+#include <wchar.h>
 
 void print_time(FILE* out, time_t time, int timezone_offset) {
   // Reject any offset > 24 hours.
@@ -157,118 +158,92 @@ void print_gopher_link(FILE* out, const char* str) {
   }
 }
 
-static size_t utf8_char_length(const char* str) {
-  unsigned char c = (unsigned char)*str;
-  if (c < 0x80) {
-    return 1;
-  }
-  if ((c & 0xe0) == 0xc0) {
-    return 2;
-  }
-  if ((c & 0xf0) == 0xe0) {
-    return 3;
-  }
-  if ((c & 0xf8) == 0xf0) {
-    return 4;
-  }
-  // Invalid UTF-8 sequence. Treat as single byte.
-  return 1;
-}
-
-static bool is_zwj(const char* str) {
-  return str[0] == '\xe2' && str[1] == '\x80' && str[2] == '\x8d';
-}
-
-static bool is_variation_selector(const char* str) {
-  return str[0] == '\xef' && str[1] == '\xb8' && str[2] == '\x8f';
-}
-
-static size_t get_char_width(wchar_t c) {
-  // ASCII characters in the range [0-31] and [127-255] are control characters
-  // or non-printable characters
-  if ((c >= 0 && c <= 31) || (c >= 127 && c <= 255)) {
-    return 1;
-  }
-
-  // Emoji range.
-  if (c >= 0x1F300 && c <= 0x1F6FF) {
-    return 2;
-  }
-
-  // Extended pictographic characters.
-  if (c >= 0x1F900 && c <= 0x1F9FF) {
-    return 2;
-  }
-
-  // Hiragana and Katakana ranges.
-  if ((c >= 0x3040 && c <= 0x309F) || (c >= 0x30A0 && c <= 0x30FF)) {
-    return 2;
-  }
-
-  // Kanji range.
-  if (c >= 0x4E00 && c <= 0x9FFF) {
-    return 2;
-  }
-  return 1;
-}
-
-// TODO(cbracken): There has to be a better way.
 void print_gopher_link_padded(FILE* out,
                               const char* str,
                               size_t width,
                               char pad_char) {
-  size_t str_len = strlen(str);
-  size_t char_count = 0;
+  if (width == 0)
+    return;
+
+  size_t display_width = 0;
   size_t last_char_width = 0;
-  size_t byte_count = 0;
-  while (byte_count < str_len) {
-    size_t pos = byte_count;
-    size_t char_length = utf8_char_length(&str[byte_count]);
-    byte_count += char_length;
-    if (char_count == width - 1 && byte_count < str_len) {
+  mbstate_t state = {0};  // mbrtowc must start from the initial state.
+
+  const char* ptr = str;
+  size_t len = strlen(str);
+  const char* end = str + len;
+
+  while (ptr < end) {
+    wchar_t wc;
+    size_t bytes = mbrtowc(&wc, ptr, end - ptr, &state);
+    if (bytes == 0)
+      break;
+
+    if (bytes == (size_t)-1 || bytes == (size_t)-2) {
+      // Invalid (-1) or incomplete (-2) UTF-8. Consume 1 byte.
+      if (display_width == width - 1 && ptr + 1 < end) {
+        fprintf(out, "\u2026");
+        display_width++;
+        break;
+      } else if (display_width < width) {
+        fprintf(out, "%c", *ptr);
+        display_width++;
+        last_char_width = 1;
+      }
+      ptr++;
+      memset(&state, 0, sizeof(state));
+      continue;
+    }
+
+    int w = wcwidth(wc);
+    size_t char_width = (w < 0) ? 0 : w;
+
+    // Print ellipsis if one character from max width but more remains.
+    if (display_width == width - 1 && ptr + bytes < end) {
       fprintf(out, "\u2026");
+      display_width++;
+      break;
+    }
+
+    // Stop if adding this character exceeds the max width.
+    if (display_width + char_width > width) {
+      break;
+    }
+
+    if (wc == L'|') {
+      fprintf(out, "\\|");
+      display_width++;
       last_char_width = 1;
-      char_count += last_char_width;
-    } else if (char_count < width) {
-      if (str[pos] == '|') {
-        fprintf(out, "\\|");
-        char_count++;
-        last_char_width = 1;
-      } else if (str[pos] == '\t') {
-        for (size_t i = 0; i < 8 && char_count < width; i++) {
-          fprintf(out, " ");
-          char_count++;
-        }
-        last_char_width = 1;
-      } else if (str[pos] == '\r' || str[pos] == '\n') {
-        // Ignore.
-        last_char_width = 0;
-      } else if (char_length == 3 && is_zwj(&str[pos])) {
-        fprintf(out, "\xe2\x80\x8d");
-        char_count -= last_char_width;
-        last_char_width = 0;
-      } else if (char_length == 3 && is_variation_selector(&str[pos])) {
-        fprintf(out, "\xef\xb8\x8f");
-        char_count -= last_char_width;
-        last_char_width = 0;
-      } else {
-        for (size_t i = pos; i < byte_count; i++) {
-          fprintf(out, "%c", str[i]);
-        }
-        wchar_t wide_char;
-        if (mbtowc(&wide_char, &str[pos], MB_CUR_MAX) >= 0) {
-          last_char_width = get_char_width(wide_char);
-        } else {
-          last_char_width = 1;
-        }
-        char_count += last_char_width;
+    } else if (wc == L'\t') {
+      for (size_t i = 0; i < 8 && display_width < width; i++) {
+        fprintf(out, " ");
+        display_width++;
       }
+      last_char_width = 1;
+    } else if (wc == L'\r' || wc == L'\n') {
+      // Ignore.
+      last_char_width = 0;
+    } else {
+      // Hack: handle zero-width joiner and variation selector.
+      // wcwidth lacks awareness of complex emoji modifier sequences.
+      if (wc == 0x200D || (wc >= 0xFE00 && wc <= 0xFE0F)) {
+        display_width -= last_char_width;
+        char_width = 0;
+      }
+
+      fprintf(out, "%.*s", (int)bytes, ptr);
+      display_width += char_width;
+      last_char_width = char_width;
     }
+
+    ptr += bytes;
   }
+
+  // Pad remainder.
   if (pad_char != '\0') {
-    while (char_count < width) {
+    while (display_width < width) {
       fprintf(out, "%c", pad_char);
-      char_count++;
+      display_width++;
     }
   }
 }
diff --git a/src/format_tests.c b/src/format_tests.c
@@ -458,10 +458,10 @@ UTEST(print_gopher_link_padded, MultiByte) {
   FILE* out = open_memstream(&buf, &size);
   ASSERT_NE(NULL, out);
 
-  print_gopher_link_padded(out, "こんにちは", 10, ' ');
+  print_gopher_link_padded(out, "こんにちは", 20, ' ');
   fclose(out);
 
-  EXPECT_STREQ("こんにちは     ", buf);
+  EXPECT_STREQ("こんにちは          ", buf);
 
   free(buf);
 }
@@ -512,8 +512,8 @@ UTEST(print_gopher_link_padded, ComplexEmoji) {
   print_gopher_link_padded(out, emoji, 10, ' ');
   fclose(out);
 
-  /* Expect 1 emoji, 9 spaces. */
-  EXPECT_STREQ("👩🏾‍⚕️         ", buf);
+  /* Expect 1 emoji, 8 spaces. */
+  EXPECT_STREQ("👩🏾‍⚕️        ", buf);
 
   free(buf);
 }
diff --git a/src/gout_tests_main.c b/src/gout_tests_main.c
@@ -1,3 +1,9 @@
+#include <locale.h>
 #include "utest.h"
 
-UTEST_MAIN()
+UTEST_STATE();
+
+int main(int argc, const char *const argv[]) {
+  setlocale(LC_ALL, "en_US.UTF-8");
+  return utest_main(argc, argv);
+}

	gout A static git page generator
	git clone https://git.bracken.jp/gout.git
	Log \| Files \| Refs \| README \| LICENSE

M	src/format.c	\|	175	+++++++++++++++++++++++++++++++++--------------------------------------------
M	src/format_tests.c	\|	8	++++----
M	src/gout_tests_main.c	\|	8	+++++++-