From 09cabcad5dda8bc74b527de2b70adadc42f107df Mon Sep 17 00:00:00 2001
From: Benno Schulenberg <bensberg@justemail.net>
Date: Tue, 2 May 2017 13:05:58 +0200
Subject: [PATCH] chars: probe for a valid UTF-8 starter byte, instead of
 overstepping

Instead of always stepping back four bytes and then tentatively
moving forward again (which is wasteful when most characters are
encoded in just one or two bytes), inspect the preceding bytes one
by one and begin the move forward at the first valid starter byte.
When none of the four preceding bytes is a starter byte, step back
just a single byte.

This reduces the backwards searching time by close to 40 percent.
---
 src/chars.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/chars.c b/src/chars.c
index bf0a2689..0eff611c 100644
--- a/src/chars.c
+++ b/src/chars.c
@@ -383,8 +383,20 @@ size_t move_mbleft(const char *buf, size_t pos)
      * possible point. */
     if (pos < 4)
 	before = 0;
-    else
-	before = pos - 4;
+    else {
+	const char *ptr = buf + pos;
+
+       if ((signed char)*(--ptr) > -65)
+	    before = pos - 1;
+       else if ((signed char)*(--ptr) > -65)
+	    before = pos - 2;
+       else if ((signed char)*(--ptr) > -65)
+	    before = pos - 3;
+       else if ((signed char)*(--ptr) > -65)
+	    before = pos - 4;
+	else
+	    before = pos - 1;
+    }
 
     while (before < pos) {
 	char_len = parse_mbchar(buf + before, NULL, NULL);
-- 
GitLab