diff mbox series

[03/11] vt: properly support zero-width Unicode code points

Message ID 20250410011839.64418-4-nico@fluxnic.net
State New
Headers show
Series vt: implement proper Unicode handling | expand

Commit Message

Nicolas Pitre April 10, 2025, 1:13 a.m. UTC
From: Nicolas Pitre <npitre@baylibre.com>

Zero-width Unicode code points are causing misalignment in vertically
aligned content, disrupting the visual layout. Let's handle zero-width
code points more intelligently.

Double-width code points are stored in the screen grid followed by a white
space code point to create the expected screen layout. When a double-width
code point is followed by a zero-width code point in the console incoming
bytestream (e.g., an emoji with a presentation selector) then we may
replace the white space padding by that zero-width code point instead of
dropping it. This maximizes screen content information while preserving
proper layout.

If a zero-width code point is preceded by a single-width code point then
the above trick is not possible and such zero-width code point must
be dropped.

VS16 (Variation Selector 16, U+FE0F) is special as it doubles the width
of the preceding single-width code point. We handle that case by giving
VS16 a width of 1 when that happens.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
 drivers/tty/vt/vt.c        | 46 ++++++++++++++++++++++++++++++++++++--
 include/linux/consolemap.h | 10 +++++++++
 2 files changed, 54 insertions(+), 2 deletions(-)

Comments

Jiri Slaby April 14, 2025, 6:51 a.m. UTC | #1
On 10. 04. 25, 3:13, Nicolas Pitre wrote:
> From: Nicolas Pitre <npitre@baylibre.com>
> 
> Zero-width Unicode code points are causing misalignment in vertically
> aligned content, disrupting the visual layout. Let's handle zero-width
> code points more intelligently.
...
> --- a/drivers/tty/vt/vt.c
> +++ b/drivers/tty/vt/vt.c
> @@ -443,6 +443,15 @@ static void vc_uniscr_scroll(struct vc_data *vc, unsigned int top,
>   	}
>   }
>   
> +static u32 vc_uniscr_getc(struct vc_data *vc, int relative_pos)
> +{
> +	int pos = vc->state.x + vc->vc_need_wrap + relative_pos;
> +
> +	if (vc->vc_uni_lines && pos >= 0 && pos < vc->vc_cols)

So that is:
   in_range(pos, 0, vc->vc_cols)
right?

> +		return vc->vc_uni_lines[vc->state.y][pos];
> +	return 0;
> +}
> +
>   static void vc_uniscr_copy_area(u32 **dst_lines,
>   				unsigned int dst_cols,
>   				unsigned int dst_rows,
> @@ -2905,18 +2914,49 @@ static bool vc_is_control(struct vc_data *vc, int tc, int c)
>   	return false;
>   }
>   
> +static void vc_con_rewind(struct vc_data *vc)
> +{
> +	if (vc->state.x && !vc->vc_need_wrap) {
> +		vc->vc_pos -= 2;
> +		vc->state.x--;
> +	}
> +	vc->vc_need_wrap = 0;
> +}
> +
>   static int vc_con_write_normal(struct vc_data *vc, int tc, int c,
>   		struct vc_draw_region *draw)
>   {
> -	int next_c;
> +	int next_c, prev_c;
>   	unsigned char vc_attr = vc->vc_attr;
>   	u16 himask = vc->vc_hi_font_mask, charmask = himask ? 0x1ff : 0xff;
>   	u8 width = 1;
>   	bool inverse = false;
>   
>   	if (vc->vc_utf && !vc->vc_disp_ctrl) {
> -		if (ucs_is_double_width(c))
> +		if (ucs_is_double_width(c)) {
>   			width = 2;
> +		} else if (ucs_is_zero_width(c)) {
> +			prev_c = vc_uniscr_getc(vc, -1);
> +			if (prev_c == ' ' &&
> +			    ucs_is_double_width(vc_uniscr_getc(vc, -2))) {
> +				/*
> +				 * Let's merge this zero-width code point with
> +				 * the preceding double-width code point by
> +				 * replacing the existing whitespace padding.
> +				 */
> +				vc_con_rewind(vc);
> +			} else if (c == 0xfe0f && prev_c != 0) {
> +				/*
> +				 * VS16 (U+FE0F) is special. Let it have a
> +				 * width of 1 when preceded by a single-width
> +				 * code point effectively making the later
> +				 * double-width.
> +				 */
> +			} else {
> +				/* Otherwise zero-width code points are ignored */
> +				goto out;
> +			}
> +		}

Please, extract this width evaluation to a separate function.

...
> --- a/include/linux/consolemap.h
> +++ b/include/linux/consolemap.h
...
> @@ -63,6 +68,11 @@ static inline bool ucs_is_double_width(uint32_t cp)
>   {
>   	return false;
>   }
> +
> +static inline bool ucs_is_zero_width(uint32_t cp)
> +{
> +	return false;
> +}

Again, is this necessary?

thanks,
Nicolas Pitre April 15, 2025, 7:06 p.m. UTC | #2
On Mon, 14 Apr 2025, Jiri Slaby wrote:

> On 10. 04. 25, 3:13, Nicolas Pitre wrote:
> > From: Nicolas Pitre <npitre@baylibre.com>
> > 
> > Zero-width Unicode code points are causing misalignment in vertically
> > aligned content, disrupting the visual layout. Let's handle zero-width
> > code points more intelligently.
> ...
> > --- a/drivers/tty/vt/vt.c
> > +++ b/drivers/tty/vt/vt.c
> > @@ -443,6 +443,15 @@ static void vc_uniscr_scroll(struct vc_data *vc,
> > unsigned int top,
> >   	}
> >   }
> >   
> > +static u32 vc_uniscr_getc(struct vc_data *vc, int relative_pos)
> > +{
> > +	int pos = vc->state.x + vc->vc_need_wrap + relative_pos;
> > +
> > +	if (vc->vc_uni_lines && pos >= 0 && pos < vc->vc_cols)
> 
> So that is:
>   in_range(pos, 0, vc->vc_cols)
> right?

Good idea. Didn't know about that one.

> >   	if (vc->vc_utf && !vc->vc_disp_ctrl) {
> > -		if (ucs_is_double_width(c))
> > +		if (ucs_is_double_width(c)) {
> >   			width = 2;
> > +		} else if (ucs_is_zero_width(c)) {
> > +			prev_c = vc_uniscr_getc(vc, -1);
> > +			if (prev_c == ' ' &&
> > +			    ucs_is_double_width(vc_uniscr_getc(vc, -2))) {
> > +				/*
> > +				 * Let's merge this zero-width code point with
> > +				 * the preceding double-width code point by
> > +				 * replacing the existing whitespace padding.
> > +				 */
> > +				vc_con_rewind(vc);
> > +			} else if (c == 0xfe0f && prev_c != 0) {
> > +				/*
> > +				 * VS16 (U+FE0F) is special. Let it have a
> > +				 * width of 1 when preceded by a single-width
> > +				 * code point effectively making the later
> > +				 * double-width.
> > +				 */
> > +			} else {
> > +				/* Otherwise zero-width code points are
> > ignored */
> > +				goto out;
> > +			}
> > +		}
> 
> Please, extract this width evaluation to a separate function.

Done.
diff mbox series

Patch

diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index bcb508bc15..5d53feeb5d 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -443,6 +443,15 @@  static void vc_uniscr_scroll(struct vc_data *vc, unsigned int top,
 	}
 }
 
+static u32 vc_uniscr_getc(struct vc_data *vc, int relative_pos)
+{
+	int pos = vc->state.x + vc->vc_need_wrap + relative_pos;
+
+	if (vc->vc_uni_lines && pos >= 0 && pos < vc->vc_cols)
+		return vc->vc_uni_lines[vc->state.y][pos];
+	return 0;
+}
+
 static void vc_uniscr_copy_area(u32 **dst_lines,
 				unsigned int dst_cols,
 				unsigned int dst_rows,
@@ -2905,18 +2914,49 @@  static bool vc_is_control(struct vc_data *vc, int tc, int c)
 	return false;
 }
 
+static void vc_con_rewind(struct vc_data *vc)
+{
+	if (vc->state.x && !vc->vc_need_wrap) {
+		vc->vc_pos -= 2;
+		vc->state.x--;
+	}
+	vc->vc_need_wrap = 0;
+}
+
 static int vc_con_write_normal(struct vc_data *vc, int tc, int c,
 		struct vc_draw_region *draw)
 {
-	int next_c;
+	int next_c, prev_c;
 	unsigned char vc_attr = vc->vc_attr;
 	u16 himask = vc->vc_hi_font_mask, charmask = himask ? 0x1ff : 0xff;
 	u8 width = 1;
 	bool inverse = false;
 
 	if (vc->vc_utf && !vc->vc_disp_ctrl) {
-		if (ucs_is_double_width(c))
+		if (ucs_is_double_width(c)) {
 			width = 2;
+		} else if (ucs_is_zero_width(c)) {
+			prev_c = vc_uniscr_getc(vc, -1);
+			if (prev_c == ' ' &&
+			    ucs_is_double_width(vc_uniscr_getc(vc, -2))) {
+				/*
+				 * Let's merge this zero-width code point with
+				 * the preceding double-width code point by
+				 * replacing the existing whitespace padding.
+				 */
+				vc_con_rewind(vc);
+			} else if (c == 0xfe0f && prev_c != 0) {
+				/*
+				 * VS16 (U+FE0F) is special. Let it have a
+				 * width of 1 when preceded by a single-width
+				 * code point effectively making the latter
+				 * double-width.
+				 */
+			} else {
+				/* Otherwise zero-width code points are ignored */
+				goto out;
+			}
+		}
 	}
 
 	/* Now try to find out how to display it */
@@ -2995,6 +3035,8 @@  static int vc_con_write_normal(struct vc_data *vc, int tc, int c,
 			tc = ' ';
 		next_c = ' ';
 	}
+
+out:
 	notify_write(vc, c);
 
 	if (inverse)
diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h
index caf079bcb8..7d778752dc 100644
--- a/include/linux/consolemap.h
+++ b/include/linux/consolemap.h
@@ -29,6 +29,11 @@  u32 conv_8bit_to_uni(unsigned char c);
 int conv_uni_to_8bit(u32 uni);
 void console_map_init(void);
 bool ucs_is_double_width(uint32_t cp);
+static inline bool ucs_is_zero_width(uint32_t cp)
+{
+	/* coming soon */
+	return false;
+}
 #else
 static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph,
 		bool use_unicode)
@@ -63,6 +68,11 @@  static inline bool ucs_is_double_width(uint32_t cp)
 {
 	return false;
 }
+
+static inline bool ucs_is_zero_width(uint32_t cp)
+{
+	return false;
+}
 #endif /* CONFIG_CONSOLE_TRANSLATIONS */
 
 #endif /* __LINUX_CONSOLEMAP_H__ */