/* $Id$ */
/**************************************************************************
 *   chars.c                                                              *
 *                                                                        *
 *   Copyright (C) 2001, 2002, 2003, 2004 Chris Allegretta                *
 *   Copyright (C) 2005, 2006, 2007 David Lawrence Ramsey                 *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 2, or (at your option)  *
 *   any later version.                                                   *
 *                                                                        *
 *   This program is distributed in the hope that it will be useful, but  *
 *   WITHOUT ANY WARRANTY; without even the implied warranty of           *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    *
 *   General Public License for more details.                             *
 *                                                                        *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the Free Software          *
 *   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA            *
 *   02110-1301, USA.                                                     *
 *                                                                        *
 **************************************************************************/

24
#include "proto.h"
25

26
#include <string.h>
27
28
#include <ctype.h>

29
#ifdef ENABLE_UTF8
30
#ifdef HAVE_WCHAR_H
31
32
#include <wchar.h>
#endif
33
#ifdef HAVE_WCTYPE_H
34
35
#include <wctype.h>
#endif
36

37
38
static bool use_utf8 = FALSE;
	/* Whether we've enabled UTF-8 support. */
static const wchar_t bad_wchar = 0xFFFD;
	/* If we get an invalid multibyte sequence, we treat it as
	 * Unicode FFFD (Replacement Character), unless we're searching
	 * for a match to it. */
static const char *const bad_mbchar = "\xEF\xBF\xBD";
	/* The UTF-8 byte sequence that encodes bad_wchar. */
static const int bad_mbchar_len = 3;
	/* The number of bytes in bad_mbchar (it is copied without its
	 * terminating null). */
45
46
47
48
49
50
51
52
53
54
55
56

/* Switch on UTF-8 handling: after this call, the multibyte routines in
 * this file take their wide-character code paths. */
void utf8_init(void)
{
    use_utf8 = TRUE;
}

/* Report whether utf8_init() has been called, i.e. whether UTF-8
 * support is currently switched on. */
bool using_utf8(void)
{
    return use_utf8;
}
57
#endif
58

59
60
#ifndef HAVE_ISBLANK
/* This function is equivalent to isblank().  A character counts as
 * blank when it is whitespace and is either a tab or not a control
 * character. */
bool nisblank(int c)
{
    return isspace(c) && (c == '\t' || !is_cntrl_char(c));
}
65
#endif
66

67
#if !defined(HAVE_ISWBLANK) && defined(ENABLE_UTF8)
68
/* This function is equivalent to iswblank().  A wide character counts
 * as blank when it is whitespace and is either a tab or not a control
 * character. */
bool niswblank(wchar_t wc)
{
    return iswspace(wc) && (wc == '\t' || !is_cntrl_wchar(wc));
}
73
#endif
74

75
76
77
78
79
80
81
/* Report whether c fits in a single byte, i.e. whether narrowing it to
 * unsigned char loses nothing.  Return TRUE if it does, and FALSE
 * otherwise. */
bool is_byte(int c)
{
    const unsigned int as_uint = (unsigned int)c;
    const unsigned char as_byte = (unsigned char)c;

    return (as_uint == as_byte);
}

82
83
84
85
86
/* This function is equivalent to isalnum() for multibyte characters.
 * c points at the first byte of the character to classify.  In UTF-8
 * mode, a sequence that cannot be decoded is classified as the
 * replacement character (bad_wchar). */
bool is_alnum_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    /* Reset the conversion state after the failed decode. */
	    mbtowc(NULL, NULL, 0);
	    wc = bad_wchar;
	}

	return iswalnum(wc);
    } else
#endif
	return isalnum((unsigned char)*c);
}

102
103
104
105
106
/* This function is equivalent to isblank() for multibyte characters.
 * c points at the first byte of the character to classify.  In UTF-8
 * mode, a sequence that cannot be decoded is classified as the
 * replacement character (bad_wchar). */
bool is_blank_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    /* Reset the conversion state after the failed decode. */
	    mbtowc(NULL, NULL, 0);
	    wc = bad_wchar;
	}

	return iswblank(wc);
    } else
#endif
	return isblank((unsigned char)*c);
}

122
123
124
125
126
127
128
/* This function is like iscntrl(), but it recognizes only the low
 * control characters: it returns TRUE exactly when c lies in the range
 * 0 through 31. */
bool is_ascii_cntrl_char(int c)
{
    return !(c < 0 || c >= 32);
}

129
/* This function is equivalent to iscntrl(), except in that it also
 * handles high-bit control characters.  It returns TRUE for 0 through
 * 31 and for 127 through 159.  It also returns TRUE for -128 through
 * -97, which are the values 128 through 159 take when the byte has
 * been stored in a signed char. */
bool is_cntrl_char(int c)
{
    return (-128 <= c && c < -96) || (0 <= c && c < 32) ||
	(127 <= c && c < 160);
}

137
#ifdef ENABLE_UTF8
138
139
140
/* This function is equivalent to iscntrl() for wide characters, except
 * in that it also handles wide control characters with their high bits
 * set.  It returns TRUE for 0 through 31 and for 127 through 159. */
bool is_cntrl_wchar(wchar_t wc)
{
    return (0 <= wc && wc < 32) || (127 <= wc && wc < 160);
}
#endif

147
148
149
150
151
152
153
/* This function is equivalent to iscntrl() for multibyte characters,
 * except in that it also handles multibyte control characters with
 * their high bits set.  c points at the first byte of the character to
 * classify.  In UTF-8 mode, a sequence that cannot be decoded is
 * classified as the replacement character (bad_wchar). */
bool is_cntrl_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    /* Reset the conversion state after the failed decode. */
	    mbtowc(NULL, NULL, 0);
	    wc = bad_wchar;
	}

	return is_cntrl_wchar(wc);
    } else
#endif
	return is_cntrl_char((unsigned char)*c);
}

169
170
/* This function is equivalent to ispunct() for multibyte characters.
 * c points at the first byte of the character to classify.  In UTF-8
 * mode, a sequence that cannot be decoded is classified as the
 * replacement character (bad_wchar). */
bool is_punct_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    /* Reset the conversion state after the failed decode. */
	    mbtowc(NULL, NULL, 0);
	    wc = bad_wchar;
	}

	return iswpunct(wc);
    } else
#endif
	return ispunct((unsigned char)*c);
}

/* Return TRUE for a multibyte character found in a word (currently only
 * an alphanumeric or punctuation character, and only the latter if
 * allow_punct is TRUE) and FALSE otherwise. */
bool is_word_mbchar(const char *c, bool allow_punct)
{
    assert(c != NULL);

    return is_alnum_mbchar(c) || (allow_punct && is_punct_mbchar(c));
}

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
201
/* c is a control character.  It displays as ^@, ^?, or ^[ch], where ch
202
 * is (c + 64).  We return that character. */
203
char control_rep(char c)
204
{
205
206
    assert(is_cntrl_char(c));

207
208
209
210
211
212
213
214
215
    /* Treat newlines embedded in a line as encoded nulls. */
    if (c == '\n')
	return '@';
    else if (c == NANO_CONTROL_8)
	return '?';
    else
	return c + 64;
}

216
#ifdef ENABLE_UTF8
217
/* c is a wide control character.  It displays as ^@, ^?, or ^[ch],
218
 * where ch is (c + 64).  We return that wide character. */
219
220
wchar_t control_wrep(wchar_t wc)
{
221
222
    assert(is_cntrl_wchar(wc));

223
224
225
226
227
228
229
230
231
232
233
    /* Treat newlines embedded in a line as encoded nulls. */
    if (wc == '\n')
	return '@';
    else if (wc == NANO_CONTROL_8)
	return '?';
    else
	return wc + 64;
}
#endif

/* c is a multibyte control character.  It displays as ^@, ^?, or ^[ch],
 * where ch is (c + 64).  We store that multibyte character in crep and
 * its length in bytes in *crep_len, and return crep.  If c is an
 * invalid multibyte sequence, the result is instead Unicode 0xFFFD
 * (Replacement Character).  The bytes written to crep are not
 * null-terminated.  If the representation cannot be converted back to
 * a multibyte character, *crep_len is set to zero. */
char *control_mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    /* Reset the conversion state after the failed decode. */
	    mbtowc(NULL, NULL, 0);
	    *crep_len = bad_mbchar_len;
	    /* Copies exactly bad_mbchar_len bytes, without a
	     * terminating null. */
	    strncpy(crep, bad_mbchar, *crep_len);
	} else {
	    *crep_len = wctomb(crep, control_wrep(wc));

	    if (*crep_len < 0) {
		/* Reset the conversion state after the failure. */
		wctomb(NULL, 0);
		*crep_len = 0;
	    }
	}
    } else {
#endif
	*crep_len = 1;
	*crep = control_rep(*c);
#ifdef ENABLE_UTF8
    }
#endif

    return crep;
}

268
/* c is a multibyte non-control character.  We store that multibyte
 * character in crep and its length in bytes in *crep_len, and return
 * crep.  If c is an invalid multibyte sequence, or decodes to a value
 * that is not valid Unicode, the result is instead Unicode 0xFFFD
 * (Replacement Character).  The bytes written to crep are not
 * null-terminated.  If the character cannot be converted back to a
 * multibyte sequence, *crep_len is set to zero. */
char *mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	/* Reject invalid Unicode characters. */
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0 || !is_valid_unicode(wc)) {
	    /* Reset the conversion state in case the decode failed. */
	    mbtowc(NULL, NULL, 0);
	    *crep_len = bad_mbchar_len;
	    /* Copies exactly bad_mbchar_len bytes, without a
	     * terminating null. */
	    strncpy(crep, bad_mbchar, *crep_len);
	} else {
	    *crep_len = wctomb(crep, wc);

	    if (*crep_len < 0) {
		/* Reset the conversion state after the failure. */
		wctomb(NULL, 0);
		*crep_len = 0;
	    }
	}
    } else {
#endif
	*crep_len = 1;
	*crep = *c;
#ifdef ENABLE_UTF8
    }
#endif

    return crep;
}

303
304
305
306
307
/* This function is equivalent to wcwidth() for multibyte characters.
 * c points at the first byte of the character to measure.  In UTF-8
 * mode, a sequence that cannot be decoded, or a character for which
 * wcwidth() reports -1, is measured as the replacement character
 * (bad_wchar).  Outside UTF-8 mode, every character is one column
 * wide. */
int mbwidth(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;
	int width;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    /* Reset the conversion state after the failed decode. */
	    mbtowc(NULL, NULL, 0);
	    wc = bad_wchar;
	}

	width = wcwidth(wc);

	/* Fall back to the width of the replacement character. */
	if (width == -1) {
	    wc = bad_wchar;
	    width = wcwidth(wc);
	}

	return width;
    } else
#endif
	return 1;
}

/* Return the maximum width in bytes of a multibyte character: 
 * MB_CUR_MAX when UTF-8 support is enabled, and 1 otherwise. */
int mb_cur_max(void)
{
    return
#ifdef ENABLE_UTF8
	use_utf8 ? MB_CUR_MAX :
#endif
	1;
}

341
342
/* Convert the Unicode value in chr to a multibyte character with the
 * same wide character value as chr, if possible.  If the conversion
 * succeeds, return the (dynamically allocated) multibyte character and
 * store its length in *chr_mb_len.  Otherwise, return an undefined
 * (dynamically allocated) multibyte character and a length of zero.
 * The caller owns the returned buffer in both cases. */
char *make_mbchar(long chr, int *chr_mb_len)
{
    char *chr_mb;
    char chr_byte;

    assert(chr_mb_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	chr_mb = charalloc(MB_CUR_MAX);
	*chr_mb_len = wctomb(chr_mb, (wchar_t)chr);

	/* Reject invalid Unicode characters. */
	if (*chr_mb_len < 0 || !is_valid_unicode((wchar_t)chr)) {
	    /* Reset the conversion state in case wctomb() failed. */
	    wctomb(NULL, 0);
	    *chr_mb_len = 0;
	}
    } else {
#endif
	/* Narrow chr by value instead of reading the first byte of the
	 * long in memory, which is the low-order byte only on
	 * little-endian machines. */
	chr_byte = (char)(unsigned char)chr;

	*chr_mb_len = 1;
	/* NOTE(review): mallocstrncpy() is assumed to allocate a new
	 * buffer when given NULL and copy one byte into it -- confirm
	 * against its definition. */
	chr_mb = mallocstrncpy(NULL, &chr_byte, 1);
#ifdef ENABLE_UTF8
    }
#endif

    return chr_mb;
}

/* Parse a multibyte character from buf.  Return the number of bytes
 * used.  If chr isn't NULL, store the multibyte character in it.  If
375
376
377
 * col isn't NULL, store the new display width in it.  If *buf is '\t',
 * we expect col to have the current display width. */
int parse_mbchar(const char *buf, char *chr, size_t *col)
378
379
380
381
382
{
    int buf_mb_len;

    assert(buf != NULL);

383
#ifdef ENABLE_UTF8
384
    if (use_utf8) {
385
386
387
	/* Get the number of bytes in the multibyte character. */
	buf_mb_len = mblen(buf, MB_CUR_MAX);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
388
389
	/* If buf contains an invalid multibyte character, only
	 * interpret buf's first byte. */
390
	if (buf_mb_len < 0) {
391
	    mblen(NULL, 0);
392
	    buf_mb_len = 1;
393
394
	} else if (buf_mb_len == 0)
	    buf_mb_len++;
395
396
397
398

	/* Save the multibyte character in chr. */
	if (chr != NULL) {
	    int i;
399

400
401
402
403
404
405
406
407
408
409
410
411
412
	    for (i = 0; i < buf_mb_len; i++)
		chr[i] = buf[i];
	}

	/* Save the column width of the wide character in col. */
	if (col != NULL) {
	    /* If we have a tab, get its width in columns using the
	     * current value of col. */
	    if (*buf == '\t')
		*col += tabsize - *col % tabsize;
	    /* If we have a control character, get its width using one
	     * column for the "^" that will be displayed in front of it,
	     * and the width in columns of its visible equivalent as
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
413
	     * returned by control_mbrep(). */
414
	    else if (is_cntrl_mbchar(buf)) {
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
415
		char *ctrl_buf_mb = charalloc(MB_CUR_MAX);
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
		int ctrl_buf_mb_len;

		(*col)++;

		ctrl_buf_mb = control_mbrep(buf, ctrl_buf_mb,
			&ctrl_buf_mb_len);

		*col += mbwidth(ctrl_buf_mb);

		free(ctrl_buf_mb);
	    /* If we have a normal character, get its width in columns
	     * normally. */
	    } else
		*col += mbwidth(buf);
	}
    } else {
#endif
	/* Get the number of bytes in the byte character. */
	buf_mb_len = 1;

	/* Save the byte character in chr. */
	if (chr != NULL)
	    *chr = *buf;

	if (col != NULL) {
	    /* If we have a tab, get its width in columns using the
	     * current value of col. */
	    if (*buf == '\t')
		*col += tabsize - *col % tabsize;
	    /* If we have a control character, it's two columns wide:
	     * one column for the "^" that will be displayed in front of
	     * it, and one column for its visible equivalent as returned
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
448
	     * by control_mbrep(). */
449
450
451
452
453
454
	    else if (is_cntrl_char((unsigned char)*buf))
		*col += 2;
	    /* If we have a normal character, it's one column wide. */
	    else
		(*col)++;
	}
455
#ifdef ENABLE_UTF8
456
457
458
459
460
    }
#endif

    return buf_mb_len;
}
461
462
463
464
465
466
467

/* Return the index in buf of the beginning of the multibyte character
 * before the one at pos. */
size_t move_mbleft(const char *buf, size_t pos)
{
    size_t pos_prev = pos;

468
    assert(buf != NULL && pos <= strlen(buf));
469
470
471
472

    /* There is no library function to move backward one multibyte
     * character.  Here is the naive, O(pos) way to do it. */
    while (TRUE) {
473
	int buf_mb_len = parse_mbchar(buf + pos - pos_prev, NULL, NULL);
474

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
475
	if (pos_prev <= buf_mb_len)
476
477
478
479
480
481
482
483
484
485
486
487
	    break;

	pos_prev -= buf_mb_len;
    }

    return pos - pos_prev;
}

/* Return the index in buf of the beginning of the multibyte character
 * after the one at pos, i.e. pos plus the length in bytes of the
 * character that starts at pos. */
size_t move_mbright(const char *buf, size_t pos)
{
    return pos + parse_mbchar(buf + pos, NULL, NULL);
}
490
491
492
493
494

#ifndef HAVE_STRCASECMP
/* This function is equivalent to strcasecmp().  It is implemented as a
 * strncasecmp() call with the largest possible length limit. */
int nstrcasecmp(const char *s1, const char *s2)
{
    return strncasecmp(s1, s2, (size_t)-1);
}
#endif

/* This function is equivalent to strcasecmp() for multibyte strings.
 * It is implemented as an mbstrncasecmp() call with the largest
 * possible length limit. */
int mbstrcasecmp(const char *s1, const char *s2)
{
    return mbstrncasecmp(s1, s2, (size_t)-1);
}

#ifndef HAVE_STRNCASECMP
/* This function is equivalent to strncasecmp(): it compares at most n
 * characters of s1 and s2, ignoring case, and returns a negative,
 * zero, or positive value according to how s1 sorts against s2.
 * Characters are compared as unsigned char values. */
int nstrncasecmp(const char *s1, const char *s2, size_t n)
{
    assert(s1 != NULL && s2 != NULL);

    /* The casts to unsigned char are required: passing a negative
     * plain char to tolower() is undefined. */
    for (; n > 0 && *s1 != '\0' && *s2 != '\0'; n--, s1++, s2++) {
	if (tolower((unsigned char)*s1) != tolower((unsigned char)*s2))
	    break;
    }

    /* If n ran out, everything compared was equal. */
    return (n > 0) ? tolower((unsigned char)*s1) -
	tolower((unsigned char)*s2) : 0;
}
#endif

/* This function is equivalent to strncasecmp() for multibyte
 * strings: it compares at most n multibyte characters of s1 and s2,
 * ignoring case.  A byte that does not start a valid multibyte
 * sequence is compared by its byte value, and never compares equal to
 * a validly encoded character. */
int mbstrncasecmp(const char *s1, const char *s2, size_t n)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	char *s1_mb, *s2_mb;
	int retval = 0;

	assert(s1 != NULL && s2 != NULL);

	s1_mb = charalloc(MB_CUR_MAX);
	s2_mb = charalloc(MB_CUR_MAX);

	/* Each pass decodes one character from each string, including
	 * the terminating null, so that a string that is a proper
	 * prefix of the other compares as different, and so that
	 * nothing is read when n is zero. */
	for (; n > 0; n--) {
	    bool bad_s1_mb = FALSE, bad_s2_mb = FALSE;
	    wchar_t ws1, ws2;
	    int s1_mb_len, s2_mb_len;

	    s1_mb_len = parse_mbchar(s1, s1_mb, NULL);

	    if (mbtowc(&ws1, s1_mb, s1_mb_len) < 0) {
		mbtowc(NULL, NULL, 0);
		ws1 = (unsigned char)*s1_mb;
		bad_s1_mb = TRUE;
	    }

	    s2_mb_len = parse_mbchar(s2, s2_mb, NULL);

	    if (mbtowc(&ws2, s2_mb, s2_mb_len) < 0) {
		mbtowc(NULL, NULL, 0);
		ws2 = (unsigned char)*s2_mb;
		bad_s2_mb = TRUE;
	    }

	    retval = (int)towlower(ws1) - (int)towlower(ws2);

	    /* An invalid byte must not match a valid character that
	     * happens to have the same value.  The invalid one is
	     * (arbitrarily) sorted last. */
	    if (retval == 0 && bad_s1_mb != bad_s2_mb)
		retval = bad_s1_mb ? 1 : -1;

	    /* Stop at the first difference, or when both strings have
	     * ended (equal characters and s1 is at its null). */
	    if (retval != 0 || *s1 == '\0')
		break;

	    s1 += s1_mb_len;
	    s2 += s2_mb_len;
	}

	free(s1_mb);
	free(s2_mb);

	return retval;
    } else
#endif
	return strncasecmp(s1, s2, n);
}

#ifndef HAVE_STRCASESTR
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
573
/* This function, nstrcasestr() (originally mutt_stristr()), was adapted
 * from mutt 1.2.4i (lib.c).  Here is the notice from that file, with
 * the Free Software Foundation's address updated:
 *
 * Copyright (C) 1996, 1997, 1998, 1999, 2000 Michael R. Elkins
 * <me@cs.hmc.edu>
 * Copyright (C) 1999, 2000 Thomas Roessler <roessler@guug.de>
 *
 *     This program is free software; you can redistribute it
 *     and/or modify it under the terms of the GNU General Public
 *     License as published by the Free Software Foundation; either
 *     version 2 of the License, or (at your option) any later
 *     version.
 *
 *     This program is distributed in the hope that it will be
 *     useful, but WITHOUT ANY WARRANTY; without even the implied
 *     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 *     PURPOSE.  See the GNU General Public License for more
 *     details.
 *
 *     You should have received a copy of the GNU General Public
 *     License along with this program; if not, write to the Free
 *     Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 *     Boston, MA  02110-1301, USA. */
597
598

/* This function is equivalent to strcasestr(): it returns a pointer to
 * the first case-insensitive occurrence of needle in haystack, or NULL
 * if there is none.  An empty needle matches at the start of haystack,
 * even when haystack is itself empty. */
const char *nstrcasestr(const char *haystack, const char *needle)
{
    assert(haystack != NULL && needle != NULL);

    if (*needle == '\0')
	return haystack;

    for (; *haystack != '\0'; haystack++) {
	const char *r = haystack, *q = needle;

	/* The casts to unsigned char are required: passing a negative
	 * plain char to tolower() is undefined. */
	for (; *q != '\0' && tolower((unsigned char)*r) ==
		tolower((unsigned char)*q); r++, q++)
	    ;

	/* The whole needle matched at this position. */
	if (*q == '\0')
	    return haystack;
    }

    return NULL;
}
#endif

617
618
619
/* This function is equivalent to strcasestr() for multibyte strings:
 * it returns a pointer to the first case-insensitive occurrence of
 * needle in haystack, or NULL if there is none.  A byte that does not
 * start a valid multibyte sequence is compared by its byte value, and
 * only matches another such invalid byte. */
const char *mbstrcasestr(const char *haystack, const char *needle)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	char *r_mb, *q_mb;
	wchar_t wr, wq;
	bool found_needle = FALSE;

	assert(haystack != NULL && needle != NULL);

	/* An empty needle matches at the start of haystack, even when
	 * haystack is itself empty, as with strcasestr(). */
	if (*needle == '\0')
	    return haystack;

	r_mb = charalloc(MB_CUR_MAX);
	q_mb = charalloc(MB_CUR_MAX);

	while (*haystack != '\0') {
	    const char *r = haystack, *q = needle;
	    int r_mb_len, q_mb_len;

	    /* Try to match all of needle at this position. */
	    while (*q != '\0') {
		bool bad_r_mb = FALSE, bad_q_mb = FALSE;

		r_mb_len = parse_mbchar(r, r_mb, NULL);

		if (mbtowc(&wr, r_mb, r_mb_len) < 0) {
		    mbtowc(NULL, NULL, 0);
		    wr = (unsigned char)*r;
		    bad_r_mb = TRUE;
		}

		q_mb_len = parse_mbchar(q, q_mb, NULL);

		if (mbtowc(&wq, q_mb, q_mb_len) < 0) {
		    mbtowc(NULL, NULL, 0);
		    wq = (unsigned char)*q;
		    bad_q_mb = TRUE;
		}

		if (bad_r_mb != bad_q_mb ||
			towlower(wr) != towlower(wq))
		    break;

		r += r_mb_len;
		q += q_mb_len;
	    }

	    if (*q == '\0') {
		found_needle = TRUE;
		break;
	    }

	    /* Advance by one whole multibyte character. */
	    haystack += move_mbright(haystack, 0);
	}

	free(r_mb);
	free(q_mb);

	return found_needle ? haystack : NULL;
    } else
#endif
	return strcasestr(haystack, needle);
}

679
#if !defined(NANO_TINY) || !defined(DISABLE_TABCOMP)
680
/* This function is equivalent to strstr(), except in that it scans the
 * string in reverse, starting at rev_start.  It returns a pointer to
 * the last occurrence of needle that begins at or before rev_start, or
 * NULL if there is none.  rev_start must point into haystack (or at
 * its terminating null). */
const char *revstrstr(const char *haystack, const char *needle, const
	char *rev_start)
{
    assert(haystack != NULL && needle != NULL && rev_start != NULL);

    if (rev_start < haystack)
	return NULL;

    for (;;) {
	const char *r = rev_start, *q = needle;

	while (*q != '\0' && *r == *q) {
	    r++;
	    q++;
	}

	/* The whole needle matched at this position. */
	if (*q == '\0')
	    return rev_start;

	/* Stop here instead of decrementing past the start of
	 * haystack, which would form an invalid pointer. */
	if (rev_start == haystack)
	    break;

	rev_start--;
    }

    return NULL;
}
699
#endif /* !NANO_TINY || !DISABLE_TABCOMP */
700

701
#ifndef NANO_TINY
702
/* This function is equivalent to strcasestr(), except in that it scans
 * the string in reverse, starting at rev_start.  It returns a pointer
 * to the last case-insensitive occurrence of needle that begins at or
 * before rev_start, or NULL if there is none.  rev_start must point
 * into haystack (or at its terminating null). */
const char *revstrcasestr(const char *haystack, const char *needle,
	const char *rev_start)
{
    assert(haystack != NULL && needle != NULL && rev_start != NULL);

    if (rev_start < haystack)
	return NULL;

    for (;;) {
	const char *r = rev_start, *q = needle;

	/* The casts to unsigned char are required: passing a negative
	 * plain char to tolower() is undefined. */
	while (*q != '\0' && tolower((unsigned char)*r) ==
		tolower((unsigned char)*q)) {
	    r++;
	    q++;
	}

	/* The whole needle matched at this position. */
	if (*q == '\0')
	    return rev_start;

	/* Stop here instead of decrementing past the start of
	 * haystack, which would form an invalid pointer. */
	if (rev_start == haystack)
	    break;

	rev_start--;
    }

    return NULL;
}
721
722
723
724
725
726
727

/* This function is equivalent to strcasestr() for multibyte strings,
 * except in that it scans the string in reverse, starting at
 * rev_start.  It returns a pointer to the last case-insensitive
 * occurrence of needle that begins at or before rev_start, or NULL if
 * there is none.  A byte that does not start a valid multibyte
 * sequence is compared by its byte value, and only matches another
 * such invalid byte. */
const char *mbrevstrcasestr(const char *haystack, const char *needle,
	const char *rev_start)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	char *r_mb, *q_mb;
	wchar_t wr, wq;
	bool begin_line = FALSE, found_needle = FALSE;

	assert(haystack != NULL && needle != NULL && rev_start != NULL);

	r_mb = charalloc(MB_CUR_MAX);
	q_mb = charalloc(MB_CUR_MAX);

	while (!begin_line) {
	    const char *r = rev_start, *q = needle;
	    int r_mb_len, q_mb_len;

	    /* Try to match all of needle at this position. */
	    while (*q != '\0') {
		bool bad_r_mb = FALSE, bad_q_mb = FALSE;

		r_mb_len = parse_mbchar(r, r_mb, NULL);

		if (mbtowc(&wr, r_mb, r_mb_len) < 0) {
		    mbtowc(NULL, NULL, 0);
		    wr = (unsigned char)*r;
		    bad_r_mb = TRUE;
		}

		q_mb_len = parse_mbchar(q, q_mb, NULL);

		if (mbtowc(&wq, q_mb, q_mb_len) < 0) {
		    mbtowc(NULL, NULL, 0);
		    wq = (unsigned char)*q;
		    bad_q_mb = TRUE;
		}

		if (bad_r_mb != bad_q_mb ||
			towlower(wr) != towlower(wq))
		    break;

		r += r_mb_len;
		q += q_mb_len;
	    }

	    if (*q == '\0') {
		found_needle = TRUE;
		break;
	    }

	    /* Step back one whole multibyte character, or stop once
	     * the start of haystack has been tried. */
	    if (rev_start == haystack)
		begin_line = TRUE;
	    else
		rev_start = haystack + move_mbleft(haystack, rev_start -
			haystack);
	}

	free(r_mb);
	free(q_mb);

	return found_needle ? rev_start : NULL;
    } else
#endif
	return revstrcasestr(haystack, needle, rev_start);
}
790
#endif /* !NANO_TINY */
791

792
793
794
795
796
797
/* This function is equivalent to strlen() for multibyte strings: it
 * counts the multibyte characters in s by calling mbstrnlen() with the
 * largest possible limit. */
size_t mbstrlen(const char *s)
{
    const size_t no_limit = (size_t)-1;

    return mbstrnlen(s, no_limit);
}

798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
#ifndef HAVE_STRNLEN
/* This function is equivalent to strnlen(): it returns the length of
 * s in bytes, but never looks at more than maxlen bytes and never
 * returns more than maxlen. */
size_t nstrnlen(const char *s, size_t maxlen)
{
    size_t len;

    assert(s != NULL);

    len = 0;

    while (len < maxlen && s[len] != '\0')
	len++;

    return len;
}
#endif

/* This function is equivalent to strnlen() for multibyte strings: it
 * returns the number of multibyte characters in s, counting at most
 * maxlen of them. */
size_t mbstrnlen(const char *s, size_t maxlen)
{
    assert(s != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	size_t n = 0;

	/* Step over one whole multibyte character per count. */
	while (*s != '\0' && maxlen > 0) {
	    s += parse_mbchar(s, NULL, NULL);
	    maxlen--;
	    n++;
	}

	return n;
    } else
#endif
	return strnlen(s, maxlen);
}
839

840
#if !defined(NANO_TINY) || !defined(DISABLE_JUSTIFY)
841
/* This function is equivalent to strchr() for multibyte strings: it
 * returns a pointer to the first occurrence in s of the multibyte
 * character that c points at, or NULL if there is none.  A byte that
 * does not start a valid multibyte sequence is compared by its byte
 * value, and only matches another such invalid byte.  In UTF-8 mode,
 * the terminating null of s is never matched. */
char *mbstrchr(const char *s, const char *c)
{
    assert(s != NULL && c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	bool bad_c_mb = FALSE;
	char *s_mb = charalloc(MB_CUR_MAX);
	wchar_t ws, wc;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    mbtowc(NULL, NULL, 0);
	    wc = (unsigned char)*c;
	    bad_c_mb = TRUE;
	}

	while (*s != '\0') {
	    /* Validity is tracked per character, so that one invalid
	     * byte doesn't mark every later character as invalid. */
	    bool bad_s_mb = FALSE;
	    int s_mb_len = parse_mbchar(s, s_mb, NULL);

	    if (mbtowc(&ws, s_mb, s_mb_len) < 0) {
		mbtowc(NULL, NULL, 0);
		ws = (unsigned char)*s;
		bad_s_mb = TRUE;
	    }

	    if (bad_s_mb == bad_c_mb && ws == wc)
		break;

	    s += s_mb_len;
	}

	free(s_mb);

	/* Reaching the end of s means that nothing matched. */
	return (*s == '\0') ? NULL : (char *)s;
    } else
#endif
	return strchr(s, *c);
}
886
#endif /* !NANO_TINY || !DISABLE_JUSTIFY */
887

888
889
890
891
892
893
894
#ifndef NANO_TINY
/* This function is equivalent to strpbrk() for multibyte strings: it
 * returns a pointer to the first multibyte character in s that also
 * occurs in accept, or NULL if there is none. */
char *mbstrpbrk(const char *s, const char *accept)
{
    assert(s != NULL && accept != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	while (*s != '\0') {
	    if (mbstrchr(accept, s) != NULL)
		return (char *)s;

	    /* Advance by one whole multibyte character. */
	    s += move_mbright(s, 0);
	}

	return NULL;
    } else
#endif
	return strpbrk(s, accept);
}

/* This function is equivalent to strpbrk(), except in that it scans the
 * string in reverse, starting at rev_start.  It returns a pointer to
 * the last character at or before rev_start that occurs in accept, or
 * NULL if there is none.  rev_start must point into s (or at its
 * terminating null, which is never matched). */
char *revstrpbrk(const char *s, const char *accept, const char
	*rev_start)
{
    assert(s != NULL && accept != NULL && rev_start != NULL);

    if (rev_start < s)
	return NULL;

    for (;;) {
	/* Skip the terminating null, which strchr() would otherwise
	 * find at the end of accept. */
	if (*rev_start != '\0' && strchr(accept, *rev_start) != NULL)
	    return (char *)rev_start;

	/* Stop here instead of decrementing past the start of s, which
	 * would form an invalid pointer. */
	if (rev_start == s)
	    break;

	rev_start--;
    }

    return NULL;
}

/* This function is equivalent to strpbrk() for multibyte strings,
 * except in that it scans the string in reverse, starting at
 * rev_start.  It returns a pointer to the last multibyte character at
 * or before rev_start that occurs in accept, or NULL if there is
 * none. */
char *mbrevstrpbrk(const char *s, const char *accept, const char
	*rev_start)
{
    assert(s != NULL && accept != NULL && rev_start != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	bool begin_line = FALSE;

	while (!begin_line) {
	    /* The terminating null never counts as a match. */
	    const char *q = (*rev_start == '\0') ? NULL :
		mbstrchr(accept, rev_start);

	    if (q != NULL)
		return (char *)rev_start;

	    /* Step back one whole multibyte character, or stop once
	     * the start of s has been tried. */
	    if (rev_start == s)
		begin_line = TRUE;
	    else
		rev_start = s + move_mbleft(s, rev_start - s);
	}

	return NULL;
    } else
#endif
	return revstrpbrk(s, accept, rev_start);
}
#endif /* !NANO_TINY */

959
#if defined(ENABLE_NANORC) && (!defined(NANO_TINY) || !defined(DISABLE_JUSTIFY))
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
/* Return TRUE if the string s contains one or more blank characters,
 * and FALSE otherwise. */
bool has_blank_chars(const char *s)
{
    assert(s != NULL);

    for (; *s != '\0'; s++) {
	/* The cast to unsigned char is required: passing a negative
	 * plain char to isblank() is undefined. */
	if (isblank((unsigned char)*s))
	    return TRUE;
    }

    return FALSE;
}

/* Return TRUE if the multibyte string s contains one or more blank
 * multibyte characters, and FALSE otherwise. */
bool has_blank_mbchars(const char *s)
{
    assert(s != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	char *chr_mb = charalloc(MB_CUR_MAX);
	bool retval = FALSE;

	while (*s != '\0') {
	    int chr_mb_len;

	    /* Copy out the next multibyte character and test it. */
	    chr_mb_len = parse_mbchar(s, chr_mb, NULL);

	    if (is_blank_mbchar(chr_mb)) {
		retval = TRUE;
		break;
	    }

	    s += chr_mb_len;
	}

	free(chr_mb);

	return retval;
    } else
#endif
	return has_blank_chars(s);
}
1005
#endif /* ENABLE_NANORC && (!NANO_TINY || !DISABLE_JUSTIFY) */
1006

1007
#ifdef ENABLE_UTF8
1008
/* Return TRUE if wc is valid Unicode, and FALSE otherwise.  A value is
 * rejected when it lies outside 0 through 0x10FFFF, inside 0xD800
 * through 0xDFFF, inside 0xFDD0 through 0xFDEF, or when its low
 * sixteen bits are 0xFFFE or 0xFFFF. */
bool is_valid_unicode(wchar_t wc)
{
    return ((0 <= wc && wc <= 0x10FFFF) && (wc <= 0xD7FF || 0xE000 <=
	wc) && (wc <= 0xFDCF || 0xFDF0 <= wc) && ((wc & 0xFFFF) <=
	0xFFFD));
}
#endif

1017
1018
1019
1020
1021
1022
1023
1024
#ifdef ENABLE_NANORC
/* Check if the string s is a valid multibyte string.  Return TRUE if it
 * is, and FALSE otherwise. */
bool is_valid_mbstring(const char *s)
{
    assert(s != NULL);

    return 
1025
#ifdef ENABLE_UTF8
1026
	use_utf8 ? (mbstowcs(NULL, s, 0) != (size_t)-1) :
1027
1028
1029
1030
#endif
	TRUE;
}
#endif /* ENABLE_NANORC */