diff mbox series

[v2,09/13] vt: support Unicode recomposition

Message ID 20250415192212.33949-10-nico@fluxnic.net
State Superseded
Headers show
Series vt: implement proper Unicode handling | expand

Commit Message

Nicolas Pitre April 15, 2025, 7:17 p.m. UTC
From: Nicolas Pitre <npitre@baylibre.com>

Try replacing any decomposed Unicode sequence by the corresponding
recomposed code point. Code point to glyph correspondence works best
after recomposition, and this applies mostly to single-width code points
therefore we can't preserve them in their decomposed form anyway.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
 drivers/tty/vt/ucs.c       | 62 ++++++++++++++++++++++++++++++++++++++
 drivers/tty/vt/vt.c        | 14 +++++++--
 include/linux/consolemap.h |  6 ++++
 3 files changed, 79 insertions(+), 3 deletions(-)

Comments

Jiri Slaby April 16, 2025, 5:07 a.m. UTC | #1
On 15. 04. 25, 21:17, Nicolas Pitre wrote:
> From: Nicolas Pitre <npitre@baylibre.com>
> 
> Try replacing any decomposed Unicode sequence by the corresponding
> recomposed code point. Code point to glyph correspondance works best
> after recomposition, and this apply mostly to single-width code points
> therefore we can't preserve them in their decomposed form anyway.
> 
> Signed-off-by: Nicolas Pitre <npitre@baylibre.com>

Reviewed-by: Jiri Slaby <jirislaby@kernel.org>
diff mbox series

Patch

diff --git a/drivers/tty/vt/ucs.c b/drivers/tty/vt/ucs.c
index 5e71aa3896..07b2bd1714 100644
--- a/drivers/tty/vt/ucs.c
+++ b/drivers/tty/vt/ucs.c
@@ -56,3 +56,65 @@  bool ucs_is_double_width(u32 cp)
 	return cp_in_range(cp, ucs_double_width_ranges,
 			   ARRAY_SIZE(ucs_double_width_ranges));
 }
+
+/*
+ * Table entry mapping a (base, combining mark) pair to its recomposed form.
+ * Using u16 to save space since all values are within BMP range.
+ */
+struct ucs_recomposition {
+	u16 base;	/* base character */
+	u16 mark;	/* combining mark */
+	u16 recomposed;	/* corresponding recomposed character */
+};
+
+#include "ucs_recompose_table.h"
+
+struct compare_key {
+	u16 base;	/* base character being looked up */
+	u16 mark;	/* combining mark being looked up */
+};
+
+/*
+ * Binary-search comparator: orders a (base, mark) search key against one
+ * recomposition table entry, by base first and then by mark.
+ */
+static int recomposition_cmp(const void *key, const void *element)
+{
+	const struct compare_key *wanted = key;
+	const struct ucs_recomposition *row = element;
+
+	/* Primary sort field: the base character */
+	if (wanted->base != row->base)
+		return wanted->base < row->base ? -1 : 1;
+
+	/* Same base, so the combining mark decides */
+	if (wanted->mark != row->mark)
+		return wanted->mark < row->mark ? -1 : 1;
+
+	/* Base and mark are both equal */
+	return 0;
+}
+
+/**
+ * ucs_recompose() - Attempt to recompose two Unicode characters into one.
+ * @base: Base Unicode code point (UCS-4)
+ * @mark: Combining mark Unicode code point (UCS-4)
+ *
+ * Return: Recomposed Unicode code point, or 0 if no recomposition is possible
+ */
+u32 ucs_recompose(u32 base, u32 mark)
+{
+	/* Table bounds check; in_range() takes (start, len), not (min, max) */
+	if (base < UCS_RECOMPOSE_MIN_BASE || base > UCS_RECOMPOSE_MAX_BASE ||
+	    mark < UCS_RECOMPOSE_MIN_MARK || mark > UCS_RECOMPOSE_MAX_MARK)
+		return 0;
+
+	struct compare_key key = { base, mark };
+	const struct ucs_recomposition *result =
+		__inline_bsearch(&key, ucs_recomposition_table,
+				 ARRAY_SIZE(ucs_recomposition_table),
+				 sizeof(*ucs_recomposition_table),
+				 recomposition_cmp);
+
+	return result ? result->recomposed : 0;
+}
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index a989feffad..76554c2040 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -2925,9 +2925,9 @@  static void vc_con_rewind(struct vc_data *vc)
 
 #define UCS_VS16	0xfe0f	/* Variation Selector 16 */
 
-static int vc_process_ucs(struct vc_data *vc, int c, int *tc)
+static int vc_process_ucs(struct vc_data *vc, int *c, int *tc)
 {
-	u32 prev_c, curr_c = c;
+	u32 prev_c, curr_c = *c;
 
 	if (ucs_is_double_width(curr_c))
 		return 2;
@@ -2964,6 +2964,14 @@  static int vc_process_ucs(struct vc_data *vc, int c, int *tc)
 		return 1;
 	}
 
+	/* try recomposition */
+	prev_c = ucs_recompose(prev_c, curr_c);
+	if (prev_c != 0) {
+		vc_con_rewind(vc);
+		*tc = *c = prev_c;
+		return 1;
+	}
+
 	/* Otherwise zero-width code points are ignored. */
 	return 0;
 }
@@ -2978,7 +2986,7 @@  static int vc_con_write_normal(struct vc_data *vc, int tc, int c,
 	bool inverse = false;
 
 	if (vc->vc_utf && !vc->vc_disp_ctrl) {
-		width = vc_process_ucs(vc, c, &tc);
+		width = vc_process_ucs(vc, &c, &tc);
 		if (!width)
 			goto out;
 	}
diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h
index b3a9118666..8167494229 100644
--- a/include/linux/consolemap.h
+++ b/include/linux/consolemap.h
@@ -30,6 +30,7 @@  int conv_uni_to_8bit(u32 uni);
 void console_map_init(void);
 bool ucs_is_double_width(uint32_t cp);
 bool ucs_is_zero_width(uint32_t cp);
+u32 ucs_recompose(u32 base, u32 mark);
 #else
 static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph,
 		bool use_unicode)
@@ -69,6 +70,11 @@  static inline bool ucs_is_zero_width(uint32_t cp)
 {
 	return false;
 }
+
+static inline u32 ucs_recompose(u32 base, u32 mark)
+{
+	return 0;	/* stub for !CONFIG_CONSOLE_TRANSLATIONS: never recomposes */
+}
 #endif /* CONFIG_CONSOLE_TRANSLATIONS */
 
 #endif /* __LINUX_CONSOLEMAP_H__ */