/* $Id$ */
/**************************************************************************
 *   chars.c                                                              *
 *                                                                        *
 *   Copyright (C) 2001-2004 Chris Allegretta                             *
 *   Copyright (C) 2005-2006 David Lawrence Ramsey                        *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 2, or (at your option)  *
 *   any later version.                                                   *
 *                                                                        *
 *   This program is distributed in the hope that it will be useful, but  *
 *   WITHOUT ANY WARRANTY; without even the implied warranty of           *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    *
 *   General Public License for more details.                             *
 *                                                                        *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the Free Software          *
 *   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA            *
 *   02110-1301, USA.                                                     *
 *                                                                        *
 **************************************************************************/

24
#include "proto.h"
25

26
#include <string.h>
27
28
#include <ctype.h>

29
#ifdef ENABLE_UTF8
30
#ifdef HAVE_WCHAR_H
31
32
#include <wchar.h>
#endif
33
#ifdef HAVE_WCTYPE_H
34
35
#include <wctype.h>
#endif
36

37
38
static bool use_utf8 = FALSE;
	/* Whether we've enabled UTF-8 support. */
static const wchar_t bad_wchar = 0xFFFD;
	/* If we get an invalid multibyte sequence, we treat it as
	 * Unicode FFFD (Replacement Character), unless we're
	 * determining if it's a control character or searching for a
	 * match to it. */
static const char *const bad_mbchar = "\xEF\xBF\xBD";
	/* The UTF-8 byte sequence for Unicode FFFD, copied out in place
	 * of an invalid multibyte sequence. */
static const int bad_mbchar_len = 3;
	/* The number of bytes in bad_mbchar, not counting the
	 * terminating null. */
46
47
48
49
50
51
52
53
54
55
56
57

/* Enable UTF-8 support.  This only sets the file-local use_utf8 flag;
 * the multibyte routines in this file test that flag to choose between
 * their wide-character and single-byte code paths. */
void utf8_init(void)
{
    use_utf8 = TRUE;
}

/* Is UTF-8 support enabled?  Return the current value of the
 * file-local use_utf8 flag. */
bool using_utf8(void)
{
    return use_utf8;
}
58
#endif
59

60
61
#ifndef HAVE_ISBLANK
/* This function is equivalent to isblank().  A character counts as
 * blank if isspace() accepts it and it is either a tab or not a
 * control character. */
bool nisblank(int c)
{
    return isspace(c) && (c == '\t' || !is_cntrl_char(c));
}
66
#endif
67

68
#if !defined(HAVE_ISWBLANK) && defined(ENABLE_UTF8)
69
/* This function is equivalent to iswblank().  A wide character counts
 * as blank if iswspace() accepts it and it is either a tab or not a
 * wide control character. */
bool niswblank(wchar_t wc)
{
    return iswspace(wc) && (wc == '\t' || !is_cntrl_wchar(wc));
}
74
#endif
75

76
77
78
79
80
81
82
/* Report whether c survives a round trip through unsigned char, i.e.
 * whether its value fits in a single byte.  Negative values never
 * do. */
bool is_byte(int c)
{
    const unsigned char low_byte = (unsigned char)c;

    return (unsigned int)c == low_byte;
}

83
84
85
86
87
/* This function is equivalent to isalnum() for multibyte characters. */
bool is_alnum_mbchar(const char *c)
{
    assert(c != NULL);

88
#ifdef ENABLE_UTF8
89
    if (use_utf8) {
90
91
	wchar_t wc;

92
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
93
	    mbtowc(NULL, NULL, 0);
94
	    wc = bad_wchar;
95
96
97
98
99
100
101
102
	}

	return iswalnum(wc);
    } else
#endif
	return isalnum((unsigned char)*c);
}

103
104
105
106
107
/* This function is equivalent to isblank() for multibyte characters. */
bool is_blank_mbchar(const char *c)
{
    assert(c != NULL);

108
#ifdef ENABLE_UTF8
109
    if (use_utf8) {
110
111
	wchar_t wc;

112
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
113
	    mbtowc(NULL, NULL, 0);
114
	    wc = bad_wchar;
115
116
	}

117
	return iswblank(wc);
118
119
    } else
#endif
120
	return isblank((unsigned char)*c);
121
122
}

123
124
125
126
127
128
129
/* A restricted iscntrl(): accept only the low control codes 0 through
 * 31.  Neither 127 nor any high-bit value is accepted here. */
bool is_ascii_cntrl_char(int c)
{
    return c >= 0 && c <= 31;
}

130
/* This function is equivalent to iscntrl(), except in that it also
 * handles high-bit control characters.  Three ranges are accepted:
 * 0 to 31, 127 to 159, and -128 to -97.  The negative range is what
 * the values 128 to 159 become when they arrive as a signed 8-bit
 * char. */
bool is_cntrl_char(int c)
{
    return (-128 <= c && c < -96) || (0 <= c && c < 32) ||
        (127 <= c && c < 160);
}

138
#ifdef ENABLE_UTF8
139
140
141
/* This function is equivalent to iscntrl() for wide characters, except
 * in that it also handles wide control characters with their high bits
 * set.  The accepted ranges are 0 to 31 and 127 to 159. */
bool is_cntrl_wchar(wchar_t wc)
{
    return (0 <= wc && wc < 32) || (127 <= wc && wc < 160);
}
#endif

148
149
150
151
152
153
154
/* This function is equivalent to iscntrl() for multibyte characters,
 * except in that it also handles multibyte control characters with
 * their high bits set.  c points to the first byte of the character.
 * In UTF-8 mode, an invalid sequence is classified as bad_wchar;
 * otherwise only the first byte of c is examined. */
bool is_cntrl_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
        wchar_t wc;

        if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
            /* Reset the conversion state after the failure. */
            mbtowc(NULL, NULL, 0);
            wc = bad_wchar;
        }

        return is_cntrl_wchar(wc);
    } else
#endif
        return is_cntrl_char((unsigned char)*c);
}

170
171
/* This function is equivalent to ispunct() for multibyte characters. */
bool is_punct_mbchar(const char *c)
172
173
174
{
    assert(c != NULL);

175
#ifdef ENABLE_UTF8
176
    if (use_utf8) {
177
178
179
	wchar_t wc;
	int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);

180
	if (c_mb_len < 0) {
181
	    mbtowc(NULL, NULL, 0);
182
	    wc = bad_wchar;
183
184
	}

185
	return iswpunct(wc);
186
187
    } else
#endif
188
189
190
191
	return ispunct((unsigned char)*c);
}

/* Return TRUE for a multibyte character found in a word (currently only
 * an alphanumeric or punctuation character, and only the latter if
 * allow_punct is TRUE) and FALSE otherwise. */
bool is_word_mbchar(const char *c, bool allow_punct)
{
    assert(c != NULL);

    return is_alnum_mbchar(c) || (allow_punct && is_punct_mbchar(c));
}

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
202
/* c is a control character.  It displays as ^@, ^?, or ^[ch], where ch
203
 * is (c + 64).  We return that character. */
204
char control_rep(char c)
205
{
206
207
    assert(is_cntrl_char(c));

208
209
210
211
212
213
214
215
216
    /* Treat newlines embedded in a line as encoded nulls. */
    if (c == '\n')
	return '@';
    else if (c == NANO_CONTROL_8)
	return '?';
    else
	return c + 64;
}

217
#ifdef ENABLE_UTF8
218
/* c is a wide control character.  It displays as ^@, ^?, or ^[ch],
219
 * where ch is (c + 64).  We return that wide character. */
220
221
wchar_t control_wrep(wchar_t wc)
{
222
223
    assert(is_cntrl_wchar(wc));

224
225
226
227
228
229
230
231
232
233
234
    /* Treat newlines embedded in a line as encoded nulls. */
    if (wc == '\n')
	return '@';
    else if (wc == NANO_CONTROL_8)
	return '?';
    else
	return wc + 64;
}
#endif

/* c is a multibyte control character.  It displays as ^@, ^?, or ^[ch],
 * where ch is (c + 64).  We store that multibyte character in crep, its
 * length in bytes in *crep_len, and return crep.  If c is an invalid
 * multibyte sequence, it will be replaced with Unicode 0xFFFD
 * (Replacement Character).  crep is not null-terminated here, and
 * *crep_len is zero if the representation can't be converted back to a
 * multibyte character. */
char *control_mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
        wchar_t wc;

        if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
            /* Reset the conversion state after the failure. */
            mbtowc(NULL, NULL, 0);
            *crep_len = bad_mbchar_len;
            /* Copy exactly the replacement sequence's bytes, with no
             * terminator. */
            memcpy(crep, bad_mbchar, *crep_len);
        } else {
            *crep_len = wctomb(crep, control_wrep(wc));

            if (*crep_len < 0) {
                wctomb(NULL, 0);
                *crep_len = 0;
            }
        }
    } else {
#endif
        *crep_len = 1;
        *crep = control_rep(*c);
#ifdef ENABLE_UTF8
    }
#endif

    return crep;
}

269
/* c is a multibyte non-control character.  We return that multibyte
270
 * character.  If crep is an invalid multibyte sequence, it will be
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
271
 * replaced with Unicode 0xFFFD (Replacement Character). */
272
273
274
275
276
char *mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
277
    if (use_utf8) {
278
279
	wchar_t wc;

280
281
	/* Reject invalid Unicode characters. */
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0 || !is_valid_unicode(wc)) {
282
283
	    mbtowc(NULL, NULL, 0);
	    *crep_len = bad_mbchar_len;
284
	    strncpy(crep, bad_mbchar, *crep_len);
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
	} else {
	    *crep_len = wctomb(crep, wc);

	    if (*crep_len < 0) {
		wctomb(NULL, 0);
		*crep_len = 0;
	    }
	}
    } else {
#endif
	*crep_len = 1;
	*crep = *c;
#ifdef ENABLE_UTF8
    }
#endif

    return crep;
}

304
305
306
307
308
/* This function is equivalent to wcwidth() for multibyte characters. */
int mbwidth(const char *c)
{
    assert(c != NULL);

309
#ifdef ENABLE_UTF8
310
    if (use_utf8) {
311
	wchar_t wc;
312
	int width;
313

314
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
315
	    mbtowc(NULL, NULL, 0);
316
	    wc = bad_wchar;
317
318
319
	}

	width = wcwidth(wc);
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
320

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
321
322
323
324
	if (width == -1) {
	    wc = bad_wchar;
	    width = wcwidth(wc);
	}
325
326
327
328
329
330
331
332
333
334

	return width;
    } else
#endif
	return 1;
}

/* Return the maximum width in bytes of a multibyte character:
 * MB_CUR_MAX in UTF-8 mode, and 1 otherwise. */
int mb_cur_max(void)
{
#ifdef ENABLE_UTF8
    if (use_utf8)
        return MB_CUR_MAX;
#endif
    return 1;
}

342
343
/* Convert the Unicode value in chr to a multibyte character with the
 * same wide character value as chr, if possible.  If the conversion
 * succeeds, return the (dynamically allocated) multibyte character and
 * store its length in *chr_mb_len.  Otherwise, return an undefined
 * (dynamically allocated) multibyte character and a length of zero.
 * Outside UTF-8 mode, the result is the single byte holding chr's
 * value.  The caller is responsible for freeing the returned
 * buffer. */
char *make_mbchar(long chr, int *chr_mb_len)
{
    char *chr_mb;

    assert(chr_mb_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
        chr_mb = charalloc(MB_CUR_MAX);
        *chr_mb_len = wctomb(chr_mb, (wchar_t)chr);

        /* Reject invalid Unicode characters. */
        if (*chr_mb_len < 0 || !is_valid_unicode((wchar_t)chr)) {
            wctomb(NULL, 0);
            *chr_mb_len = 0;
        }
    } else {
#endif
        /* Convert by value instead of reading the first byte of chr's
         * storage, which is the high-order byte on big-endian
         * machines. */
        char byte = (char)chr;

        *chr_mb_len = 1;
        chr_mb = mallocstrncpy(NULL, &byte, 1);
#ifdef ENABLE_UTF8
    }
#endif

    return chr_mb;
}

/* Parse a multibyte character from buf.  Return the number of bytes
 * used.  If chr isn't NULL, store the multibyte character in it.  If
376
377
378
 * col isn't NULL, store the new display width in it.  If *buf is '\t',
 * we expect col to have the current display width. */
int parse_mbchar(const char *buf, char *chr, size_t *col)
379
380
381
382
383
{
    int buf_mb_len;

    assert(buf != NULL);

384
#ifdef ENABLE_UTF8
385
    if (use_utf8) {
386
387
388
	/* Get the number of bytes in the multibyte character. */
	buf_mb_len = mblen(buf, MB_CUR_MAX);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
389
390
	/* If buf contains an invalid multibyte character, only
	 * interpret buf's first byte. */
391
	if (buf_mb_len < 0) {
392
	    mblen(NULL, 0);
393
	    buf_mb_len = 1;
394
395
	} else if (buf_mb_len == 0)
	    buf_mb_len++;
396
397
398
399

	/* Save the multibyte character in chr. */
	if (chr != NULL) {
	    int i;
400

401
402
403
404
405
406
407
408
409
410
411
412
413
	    for (i = 0; i < buf_mb_len; i++)
		chr[i] = buf[i];
	}

	/* Save the column width of the wide character in col. */
	if (col != NULL) {
	    /* If we have a tab, get its width in columns using the
	     * current value of col. */
	    if (*buf == '\t')
		*col += tabsize - *col % tabsize;
	    /* If we have a control character, get its width using one
	     * column for the "^" that will be displayed in front of it,
	     * and the width in columns of its visible equivalent as
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
414
	     * returned by control_mbrep(). */
415
	    else if (is_cntrl_mbchar(buf)) {
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
416
		char *ctrl_buf_mb = charalloc(MB_CUR_MAX);
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
		int ctrl_buf_mb_len;

		(*col)++;

		ctrl_buf_mb = control_mbrep(buf, ctrl_buf_mb,
			&ctrl_buf_mb_len);

		*col += mbwidth(ctrl_buf_mb);

		free(ctrl_buf_mb);
	    /* If we have a normal character, get its width in columns
	     * normally. */
	    } else
		*col += mbwidth(buf);
	}
    } else {
#endif
	/* Get the number of bytes in the byte character. */
	buf_mb_len = 1;

	/* Save the byte character in chr. */
	if (chr != NULL)
	    *chr = *buf;

	if (col != NULL) {
	    /* If we have a tab, get its width in columns using the
	     * current value of col. */
	    if (*buf == '\t')
		*col += tabsize - *col % tabsize;
	    /* If we have a control character, it's two columns wide:
	     * one column for the "^" that will be displayed in front of
	     * it, and one column for its visible equivalent as returned
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
449
	     * by control_mbrep(). */
450
451
452
453
454
455
	    else if (is_cntrl_char((unsigned char)*buf))
		*col += 2;
	    /* If we have a normal character, it's one column wide. */
	    else
		(*col)++;
	}
456
#ifdef ENABLE_UTF8
457
458
459
460
461
    }
#endif

    return buf_mb_len;
}
462
463
464
465
466
467
468

/* Return the index in buf of the beginning of the multibyte character
 * before the one at pos.  If pos is zero, zero is returned. */
size_t move_mbleft(const char *buf, size_t pos)
{
    /* The number of bytes between the character being examined and
     * pos. */
    size_t pos_prev = pos;

    assert(buf != NULL && pos <= strlen(buf));

    /* There is no library function to move backward one multibyte
     * character.  Here is the naive, O(pos) way to do it: walk forward
     * from the start of buf until the current character is the last
     * one that begins before pos. */
    for (;;) {
        int buf_mb_len = parse_mbchar(buf + pos - pos_prev, NULL, NULL);

        /* parse_mbchar() never returns less than one, so the cast is
         * safe. */
        if (pos_prev <= (size_t)buf_mb_len)
            break;

        pos_prev -= buf_mb_len;
    }

    return pos - pos_prev;
}

/* Return the index in buf of the beginning of the multibyte character
 * after the one at pos, i.e. pos plus the byte length of the character
 * at pos as reported by parse_mbchar(). */
size_t move_mbright(const char *buf, size_t pos)
{
    return pos + parse_mbchar(buf + pos, NULL, NULL);
}
491
492
493
494
495

#ifndef HAVE_STRCASECMP
/* This function is equivalent to strcasecmp().  It is implemented as
 * strncasecmp() with the largest possible length limit. */
int nstrcasecmp(const char *s1, const char *s2)
{
    return strncasecmp(s1, s2, (size_t)-1);
}
#endif

/* This function is equivalent to strcasecmp() for multibyte strings.
 * It is implemented as mbstrncasecmp() with the largest possible
 * length limit. */
int mbstrcasecmp(const char *s1, const char *s2)
{
    return mbstrncasecmp(s1, s2, (size_t)-1);
}

#ifndef HAVE_STRNCASECMP
/* This function is equivalent to strncasecmp(). */
int nstrncasecmp(const char *s1, const char *s2, size_t n)
{
    assert(s1 != NULL && s2 != NULL);

    for (; n > 0 && *s1 != '\0' && *s2 != '\0'; n--, s1++, s2++) {
	if (tolower(*s1) != tolower(*s2))
	    break;
    }

    if (n > 0)
518
	return tolower(*s1) - tolower(*s2);
519
520
521
522
523
524
525
526
527
    else
	return 0;
}
#endif

/* This function is equivalent to strncasecmp() for multibyte
 * strings.  Compare at most n multibyte characters of s1 and s2
 * without regard to case.  An invalid sequence is compared as the
 * value of its first byte, and never matches a valid character. */
int mbstrncasecmp(const char *s1, const char *s2, size_t n)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
        char *s1_mb, *s2_mb;
        int diff = 0;

        assert(s1 != NULL && s2 != NULL);

        s1_mb = charalloc(MB_CUR_MAX);
        s2_mb = charalloc(MB_CUR_MAX);

        for (; n > 0; n--) {
            wchar_t ws1, ws2;
            bool bad_s1_mb = FALSE, bad_s2_mb = FALSE;
            int s1_mb_len, s2_mb_len;

            /* Decode the current character of each string.  A
             * terminating null decodes to the null wide character, so
             * a string that ends early compares as smaller. */
            s1_mb_len = parse_mbchar(s1, s1_mb, NULL);

            if (mbtowc(&ws1, s1_mb, s1_mb_len) < 0) {
                mbtowc(NULL, NULL, 0);
                ws1 = (unsigned char)*s1_mb;
                bad_s1_mb = TRUE;
            }

            s2_mb_len = parse_mbchar(s2, s2_mb, NULL);

            if (mbtowc(&ws2, s2_mb, s2_mb_len) < 0) {
                mbtowc(NULL, NULL, 0);
                ws2 = (unsigned char)*s2_mb;
                bad_s2_mb = TRUE;
            }

            /* Stop at the end of either string or at the first
             * mismatch, and work out the result from the characters
             * decoded in this very iteration. */
            if (*s1 == '\0' || *s2 == '\0' || bad_s1_mb != bad_s2_mb ||
                    towlower(ws1) != towlower(ws2)) {
                diff = (int)towlower(ws1) - (int)towlower(ws2);

                /* A valid character and an invalid byte with the same
                 * value still differ. */
                if (diff == 0 && bad_s1_mb != bad_s2_mb)
                    diff = bad_s1_mb ? 1 : -1;

                break;
            }

            s1 += s1_mb_len;
            s2 += s2_mb_len;
        }

        free(s1_mb);
        free(s2_mb);

        return diff;
    } else
#endif
        return strncasecmp(s1, s2, n);
}

#ifndef HAVE_STRCASESTR
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
577
/* This function, nstrcasestr() (originally mutt_stristr()), was adapted
 * from mutt 1.2.4i (lib.c).  Here is the notice from that file, with
 * the Free Software Foundation's address updated:
 *
 * Copyright (C) 1996-2000 Michael R. Elkins <me@cs.hmc.edu>
 * Copyright (C) 1999-2000 Thomas Roessler <roessler@guug.de>
 * 
 *     This program is free software; you can redistribute it
 *     and/or modify it under the terms of the GNU General Public
 *     License as published by the Free Software Foundation; either
 *     version 2 of the License, or (at your option) any later
 *     version.
 * 
 *     This program is distributed in the hope that it will be
 *     useful, but WITHOUT ANY WARRANTY; without even the implied
 *     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 *     PURPOSE.  See the GNU General Public License for more
 *     details.
 * 
 *     You should have received a copy of the GNU General Public
 *     License along with this program; if not, write to the Free
 *     Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 *     Boston, MA  02110-1301, USA. */
600
601

/* This function is equivalent to strcasestr(). */
602
603
604
605
606
const char *nstrcasestr(const char *haystack, const char *needle)
{
    assert(haystack != NULL && needle != NULL);

    for (; *haystack != '\0'; haystack++) {
607
	const char *r = haystack, *q = needle;
608

609
	for (; tolower(*r) == tolower(*q) && *q != '\0'; r++, q++)
610
611
612
613
614
615
616
617
618
619
	    ;

	if (*q == '\0')
	    return haystack;
    }

    return NULL;
}
#endif

620
621
622
/* This function is equivalent to strcasestr() for multibyte strings.
 * Return a pointer to the first case-insensitive occurrence of needle
 * in haystack, or NULL if there is none.  An invalid sequence is
 * compared as the value of its first byte, and never matches a valid
 * character.  An empty needle matches at the start of haystack. */
const char *mbstrcasestr(const char *haystack, const char *needle)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
        char *r_mb, *q_mb;
        bool found_needle = FALSE;

        assert(haystack != NULL && needle != NULL);

        /* Without this, an empty needle would not be found in an empty
         * haystack, unlike with strcasestr(). */
        if (*needle == '\0')
            return haystack;

        r_mb = charalloc(MB_CUR_MAX);
        q_mb = charalloc(MB_CUR_MAX);

        while (*haystack != '\0') {
            const char *r = haystack, *q = needle;

            /* Try to match all of needle at this position. */
            while (*q != '\0') {
                wchar_t wr, wq;
                bool bad_r_mb = FALSE, bad_q_mb = FALSE;
                int r_mb_len, q_mb_len;

                r_mb_len = parse_mbchar(r, r_mb, NULL);

                if (mbtowc(&wr, r_mb, r_mb_len) < 0) {
                    mbtowc(NULL, NULL, 0);
                    wr = (unsigned char)*r;
                    bad_r_mb = TRUE;
                }

                q_mb_len = parse_mbchar(q, q_mb, NULL);

                if (mbtowc(&wq, q_mb, q_mb_len) < 0) {
                    mbtowc(NULL, NULL, 0);
                    wq = (unsigned char)*q;
                    bad_q_mb = TRUE;
                }

                if (bad_r_mb != bad_q_mb ||
                        towlower(wr) != towlower(wq))
                    break;

                r += r_mb_len;
                q += q_mb_len;
            }

            if (*q == '\0') {
                found_needle = TRUE;
                break;
            }

            haystack += move_mbright(haystack, 0);
        }

        free(r_mb);
        free(q_mb);

        return found_needle ? haystack : NULL;
    } else
#endif
        return strcasestr(haystack, needle);
}

682
#if !defined(NANO_TINY) || !defined(DISABLE_TABCOMP)
683
/* This function is equivalent to strstr(), except in that it scans the
684
 * string in reverse, starting at rev_start. */
685
686
687
688
689
690
691
692
const char *revstrstr(const char *haystack, const char *needle, const
	char *rev_start)
{
    assert(haystack != NULL && needle != NULL && rev_start != NULL);

    for (; rev_start >= haystack; rev_start--) {
	const char *r, *q;

693
	for (r = rev_start, q = needle; *r == *q && *q != '\0'; r++, q++)
694
695
696
697
698
699
700
701
	    ;

	if (*q == '\0')
	    return rev_start;
    }

    return NULL;
}
702
#endif /* !NANO_TINY || !DISABLE_TABCOMP */
703

704
#ifndef NANO_TINY
705
/* This function is equivalent to strcasestr(), except in that it scans
706
 * the string in reverse, starting at rev_start. */
707
708
709
710
711
712
713
714
const char *revstrcasestr(const char *haystack, const char *needle,
	const char *rev_start)
{
    assert(haystack != NULL && needle != NULL && rev_start != NULL);

    for (; rev_start >= haystack; rev_start--) {
	const char *r = rev_start, *q = needle;

715
	for (; tolower(*r) == tolower(*q) && *q != '\0'; r++, q++)
716
717
718
719
720
721
722
723
	    ;

	if (*q == '\0')
	    return rev_start;
    }

    return NULL;
}
724
725
726
727
728
729
730

/* This function is equivalent to strcasestr() for multibyte strings,
 * except in that it scans the string in reverse, starting at
 * rev_start.  Return a pointer to the last case-insensitive occurrence
 * of needle in haystack that begins at or before rev_start, or NULL if
 * there is none.  An invalid sequence is compared as the value of its
 * first byte, and never matches a valid character. */
const char *mbrevstrcasestr(const char *haystack, const char *needle,
        const char *rev_start)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
        char *r_mb, *q_mb;
        bool found_needle = FALSE;

        assert(haystack != NULL && needle != NULL && rev_start != NULL);

        r_mb = charalloc(MB_CUR_MAX);
        q_mb = charalloc(MB_CUR_MAX);

        for (;;) {
            const char *r = rev_start, *q = needle;

            /* Try to match all of needle at this position. */
            while (*q != '\0') {
                wchar_t wr, wq;
                bool bad_r_mb = FALSE, bad_q_mb = FALSE;
                int r_mb_len, q_mb_len;

                r_mb_len = parse_mbchar(r, r_mb, NULL);

                if (mbtowc(&wr, r_mb, r_mb_len) < 0) {
                    mbtowc(NULL, NULL, 0);
                    wr = (unsigned char)*r;
                    bad_r_mb = TRUE;
                }

                q_mb_len = parse_mbchar(q, q_mb, NULL);

                if (mbtowc(&wq, q_mb, q_mb_len) < 0) {
                    mbtowc(NULL, NULL, 0);
                    wq = (unsigned char)*q;
                    bad_q_mb = TRUE;
                }

                if (bad_r_mb != bad_q_mb ||
                        towlower(wr) != towlower(wq))
                    break;

                r += r_mb_len;
                q += q_mb_len;
            }

            if (*q == '\0') {
                found_needle = TRUE;
                break;
            }

            /* We've tried the beginning of the line; give up. */
            if (rev_start == haystack)
                break;

            rev_start = haystack + move_mbleft(haystack, rev_start -
                    haystack);
        }

        free(r_mb);
        free(q_mb);

        return found_needle ? rev_start : NULL;
    } else
#endif
        return revstrcasestr(haystack, needle, rev_start);
}
793
#endif /* !NANO_TINY */
794

795
796
797
798
799
800
/* Count the multibyte characters in s, as strlen() counts bytes.  This
 * is mbstrnlen() with the largest possible limit. */
size_t mbstrlen(const char *s)
{
    const size_t no_limit = (size_t)-1;

    return mbstrnlen(s, no_limit);
}

801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
#ifndef HAVE_STRNLEN
/* This function is equivalent to strnlen(). */
size_t nstrnlen(const char *s, size_t maxlen)
{
    size_t n = 0;

    assert(s != NULL);

    for (; maxlen > 0 && *s != '\0'; maxlen--, n++, s++)
	;

    return n;
}
#endif

/* This function is equivalent to strnlen() for multibyte strings.
 * Return the number of multibyte characters in s, but never more than
 * maxlen. */
size_t mbstrnlen(const char *s, size_t maxlen)
{
    assert(s != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
        size_t n = 0;

        /* Step over one multibyte character per iteration. */
        while (*s != '\0' && maxlen > 0) {
            s += parse_mbchar(s, NULL, NULL);
            maxlen--;
            n++;
        }

        return n;
    } else
#endif
        return strnlen(s, maxlen);
}
842

843
#if !defined(NANO_TINY) || !defined(DISABLE_JUSTIFY)
844
/* This function is equivalent to strchr() for multibyte strings. */
845
char *mbstrchr(const char *s, const char *c)
846
847
848
849
{
    assert(s != NULL && c != NULL);

#ifdef ENABLE_UTF8
850
    if (use_utf8) {
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
851
	bool bad_s_mb = FALSE, bad_c_mb = FALSE;
852
853
854
855
856
	char *s_mb = charalloc(MB_CUR_MAX);
	const char *q = s;
	wchar_t ws, wc;
	int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);

857
	if (c_mb_len < 0) {
858
859
860
861
862
863
	    mbtowc(NULL, NULL, 0);
	    wc = (unsigned char)*c;
	    bad_c_mb = TRUE;
	}

	while (*s != '\0') {
864
	    int s_mb_len = parse_mbchar(s, s_mb, NULL);
865

866
	    if (mbtowc(&ws, s_mb, s_mb_len) < 0) {
867
868
869
870
871
872
873
874
875
876
877
878
879
880
		mbtowc(NULL, NULL, 0);
		ws = (unsigned char)*s;
		bad_s_mb = TRUE;
	    }

	    if (bad_s_mb == bad_c_mb && ws == wc)
		break;

	    s += s_mb_len;
	    q += s_mb_len;
	}

	free(s_mb);

881
	if (*s == '\0')
882
883
884
885
886
887
888
	    q = NULL;

	return (char *)q;
    } else
#endif
	return strchr(s, *c);
}
889
#endif /* !NANO_TINY || !DISABLE_JUSTIFY */
890

891
892
893
894
895
896
897
#ifndef NANO_TINY
/* This function is equivalent to strpbrk() for multibyte strings. */
char *mbstrpbrk(const char *s, const char *accept)
{
    assert(s != NULL && accept != NULL);

#ifdef ENABLE_UTF8
898
    if (use_utf8) {
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
	while (*s != '\0') {
	    if (mbstrchr(accept, s) != NULL)
		return (char *)s;

	    s += move_mbright(s, 0);
	}

	return NULL;
    } else
#endif
	return strpbrk(s, accept);
}

/* This function is equivalent to strpbrk(), except in that it scans the
 * string in reverse, starting at rev_start. */
char *revstrpbrk(const char *s, const char *accept, const char
	*rev_start)
{
    assert(s != NULL && accept != NULL && rev_start != NULL);

    for (; rev_start >= s; rev_start--) {
	const char *q = (*rev_start == '\0') ? NULL : strchr(accept,
		*rev_start);

	if (q != NULL)
	    return (char *)rev_start;
    }

    return NULL;
}

/* This function is equivalent to strpbrk() for multibyte strings,
 * except in that it scans the string in reverse, starting at
 * rev_start.  Return a pointer to the first multibyte character met
 * that also occurs in accept, or NULL if none does. */
char *mbrevstrpbrk(const char *s, const char *accept, const char
        *rev_start)
{
    assert(s != NULL && accept != NULL && rev_start != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
        for (;;) {
            /* A null byte in the scanned string is never a match. */
            if (*rev_start != '\0' &&
                    mbstrchr(accept, rev_start) != NULL)
                return (char *)rev_start;

            /* We've tried the beginning of the string; give up. */
            if (rev_start == s)
                return NULL;

            rev_start = s + move_mbleft(s, rev_start - s);
        }
    } else
#endif
        return revstrpbrk(s, accept, rev_start);
}
#endif /* !NANO_TINY */

962
#if defined(ENABLE_NANORC) && (!defined(NANO_TINY) || !defined(DISABLE_JUSTIFY))
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
/* Return TRUE if the string s contains one or more blank characters,
 * and FALSE otherwise. */
bool has_blank_chars(const char *s)
{
    assert(s != NULL);

    for (; *s != '\0'; s++) {
	if (isblank(*s))
	    return TRUE;
    }

    return FALSE;
}

/* Return TRUE if the multibyte string s contains one or more blank
 * multibyte characters, and FALSE otherwise. */
bool has_blank_mbchars(const char *s)
{
    assert(s != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
        char *chr_mb = charalloc(MB_CUR_MAX);
        bool retval = FALSE;

        while (*s != '\0') {
            int chr_mb_len = parse_mbchar(s, chr_mb, NULL);

            if (is_blank_mbchar(chr_mb)) {
                retval = TRUE;
                break;
            }

            s += chr_mb_len;
        }

        free(chr_mb);

        return retval;
    } else
#endif
        return has_blank_chars(s);
}
1008
#endif /* ENABLE_NANORC && (!NANO_TINY || !DISABLE_JUSTIFY) */
1009

1010
#ifdef ENABLE_UTF8
1011
/* Return TRUE if wc is valid Unicode, and FALSE otherwise.  A value is
 * rejected if it lies outside 0 to 0x10FFFF, inside 0xD800 to 0xDFFF,
 * inside 0xFDD0 to 0xFDEF, or if its low sixteen bits are 0xFFFE or
 * 0xFFFF. */
bool is_valid_unicode(wchar_t wc)
{
    return ((0 <= wc && wc <= 0x10FFFF) &&
        (wc <= 0xD7FF || 0xE000 <= wc) &&
        (wc <= 0xFDCF || 0xFDF0 <= wc) &&
        ((wc & 0xFFFF) <= 0xFFFD));
}
#endif

1020
1021
1022
1023
1024
1025
1026
1027
#ifdef ENABLE_NANORC
/* Check if the string s is a valid multibyte string.  Return TRUE if it
 * is, and FALSE otherwise. */
bool is_valid_mbstring(const char *s)
{
    assert(s != NULL);

    return 
1028
#ifdef ENABLE_UTF8
1029
	use_utf8 ?
1030
	(mbstowcs(NULL, s, 0) != (size_t)-1) :
1031
1032
1033
1034
#endif
	TRUE;
}
#endif /* ENABLE_NANORC */