/* $Id$ */
/**************************************************************************
 *   chars.c                                                              *
 *                                                                        *
 *   Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009   *
 *   Free Software Foundation, Inc.                                       *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 3, or (at your option)  *
 *   any later version.                                                   *
 *                                                                        *
 *   This program is distributed in the hope that it will be useful, but  *
 *   WITHOUT ANY WARRANTY; without even the implied warranty of           *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    *
 *   General Public License for more details.                             *
 *                                                                        *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the Free Software          *
 *   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA            *
 *   02110-1301, USA.                                                     *
 *                                                                        *
 **************************************************************************/

#include "proto.h"

#include <string.h>
#include <ctype.h>

#ifdef ENABLE_UTF8
#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif
#ifdef HAVE_WCTYPE_H
#include <wctype.h>
#endif

37
38
static bool use_utf8 = FALSE;
	/* Whether we've enabled UTF-8 support. */
39
40
static const wchar_t bad_wchar = 0xFFFD;
	/* If we get an invalid multibyte sequence, we treat it as
41
42
	 * Unicode FFFD (Replacement Character), unless we're searching
	 * for a match to it. */
43
static const char *const bad_mbchar = "\xEF\xBF\xBD";
44
static const int bad_mbchar_len = 3;
45
46
47
48
49
50
51
52
53
54
55
56

/* Enable UTF-8 support by setting the file-local use_utf8 flag, which
 * the multibyte routines in this file test before taking their
 * wide-character code paths. */
void utf8_init(void)
{
    use_utf8 = TRUE;
}

/* Is UTF-8 support enabled?  Return the current value of the
 * file-local use_utf8 flag. */
bool using_utf8(void)
{
    return use_utf8;
}
#endif
#ifndef HAVE_ISBLANK
/* This function is equivalent to isblank(): return TRUE if c is a
 * whitespace character that is either a tab or not a control character
 * (as decided by is_cntrl_char()), and FALSE otherwise. */
bool nisblank(int c)
{
    return isspace(c) && (c == '\t' || !is_cntrl_char(c));
}
#endif

#if !defined(HAVE_ISWBLANK) && defined(ENABLE_UTF8)
/* This function is equivalent to iswblank(): return TRUE if wc is a
 * wide whitespace character that is either a tab or not a control
 * character (as decided by is_cntrl_wchar()), and FALSE otherwise. */
bool niswblank(wchar_t wc)
{
    return iswspace(wc) && (wc == '\t' || !is_cntrl_wchar(wc));
}
#endif
/* Report whether c fits in a single byte: TRUE when converting c to
 * unsigned char loses nothing, and FALSE otherwise (that is, for
 * negative values and for values above the unsigned char range). */
bool is_byte(int c)
{
    const unsigned char low_byte = (unsigned char)c;

    return (unsigned int)c == low_byte;
}

/* Reset mbtowc()'s internal conversion state by calling it with a null
 * string pointer.  The result of that call is deliberately ignored. */
void mbtowc_reset(void)
{
    IGNORE_CALL_RESULT(mbtowc(NULL, NULL, 0));
}

/* Reset wctomb()'s internal conversion state by calling it with a null
 * buffer pointer.  The result of that call is deliberately ignored. */
void wctomb_reset(void)
{
    IGNORE_CALL_RESULT(wctomb(NULL, 0));
}

92
93
94
95
96
/* This function is equivalent to isalnum() for multibyte characters. */
bool is_alnum_mbchar(const char *c)
{
    assert(c != NULL);

97
#ifdef ENABLE_UTF8
98
    if (use_utf8) {
99
100
	wchar_t wc;

101
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
102
	    mbtowc_reset();
103
	    wc = bad_wchar;
104
105
106
107
108
109
110
111
	}

	return iswalnum(wc);
    } else
#endif
	return isalnum((unsigned char)*c);
}

112
113
114
115
116
/* This function is equivalent to isblank() for multibyte characters. */
bool is_blank_mbchar(const char *c)
{
    assert(c != NULL);

117
#ifdef ENABLE_UTF8
118
    if (use_utf8) {
119
120
	wchar_t wc;

121
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
122
	    mbtowc_reset();
123
	    wc = bad_wchar;
124
125
	}

126
	return iswblank(wc);
127
128
    } else
#endif
129
	return isblank((unsigned char)*c);
130
131
}

/* This function is equivalent to iscntrl(), except in that it only
 * handles non-high-bit control characters: it returns TRUE exactly
 * when c is in the range 0 to 31 inclusive. */
bool is_ascii_cntrl_char(int c)
{
    return (0 <= c && c < 32);
}

/* This function is equivalent to iscntrl(), except in that it also
 * handles high-bit control characters.  It returns TRUE for 0 to 31,
 * for 127 to 159, and for -128 to -97 (the values 128 to 159 take when
 * they arrive as a signed 8-bit char). */
bool is_cntrl_char(int c)
{
    return (-128 <= c && c < -96) || (0 <= c && c < 32) ||
	(127 <= c && c < 160);
}
#ifdef ENABLE_UTF8
/* This function is equivalent to iscntrl() for wide characters, except
 * in that it also handles wide control characters with their high bits
 * set.  It returns TRUE for 0 to 31 and for 127 to 159. */
bool is_cntrl_wchar(wchar_t wc)
{
    return (0 <= wc && wc < 32) || (127 <= wc && wc < 160);
}
#endif

/* This function is equivalent to iscntrl() for multibyte characters,
 * except in that it also handles multibyte control characters with
 * their high bits set.  c points to the first byte of the character to
 * test.  When UTF-8 support is on, an invalid multibyte sequence is
 * tested as bad_wchar; otherwise only the first byte of c is
 * examined. */
bool is_cntrl_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wc = bad_wchar;
	}

	return is_cntrl_wchar(wc);
    } else
#endif
	return is_cntrl_char((unsigned char)*c);
}

179
180
/* This function is equivalent to ispunct() for multibyte characters. */
bool is_punct_mbchar(const char *c)
181
182
183
{
    assert(c != NULL);

184
#ifdef ENABLE_UTF8
185
    if (use_utf8) {
186
187
188
	wchar_t wc;
	int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);

189
	if (c_mb_len < 0) {
190
	    mbtowc_reset();
191
	    wc = bad_wchar;
192
193
	}

194
	return iswpunct(wc);
195
196
    } else
#endif
197
198
199
200
	return ispunct((unsigned char)*c);
}

/* Return TRUE for a multibyte character found in a word (currently only
 * an alphanumeric or punctuation character, and only the latter if
 * allow_punct is TRUE) and FALSE otherwise. */
bool is_word_mbchar(const char *c, bool allow_punct)
{
    assert(c != NULL);

    return is_alnum_mbchar(c) || (allow_punct && is_punct_mbchar(c));
}

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
211
/* c is a control character.  It displays as ^@, ^?, or ^[ch], where ch
212
 * is (c + 64).  We return that character. */
213
char control_rep(char c)
214
{
215
216
    assert(is_cntrl_char(c));

217
218
219
220
221
222
223
224
225
    /* Treat newlines embedded in a line as encoded nulls. */
    if (c == '\n')
	return '@';
    else if (c == NANO_CONTROL_8)
	return '?';
    else
	return c + 64;
}

#ifdef ENABLE_UTF8
/* c is a wide control character.  It displays as ^@, ^?, or ^[ch],
 * where ch is (c + 64).  We return that wide character (the one shown
 * after the caret). */
wchar_t control_wrep(wchar_t wc)
{
    assert(is_cntrl_wchar(wc));

    /* Treat newlines embedded in a line as encoded nulls. */
    if (wc == '\n')
	return '@';
    else if (wc == NANO_CONTROL_8)
	return '?';
    else
	return wc + 64;
}
#endif

/* c is a multibyte control character.  It displays as ^@, ^?, or ^[ch],
 * where ch is (c + 64).  We store that multibyte character in crep and
 * its length in bytes in *crep_len, and return crep.  No terminating
 * null byte is written to crep.  If c is an invalid multibyte sequence,
 * crep gets Unicode 0xFFFD (Replacement Character) instead.  If the
 * representation can't be converted back to multibyte form, *crep_len
 * is set to zero. */
char *control_mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    *crep_len = bad_mbchar_len;
	    strncpy(crep, bad_mbchar, *crep_len);
	} else {
	    *crep_len = wctomb(crep, control_wrep(wc));

	    if (*crep_len < 0) {
		wctomb_reset();
		*crep_len = 0;
	    }
	}
    } else {
#endif
	*crep_len = 1;
	*crep = control_rep(*c);
#ifdef ENABLE_UTF8
    }
#endif

    return crep;
}

278
/* c is a multibyte non-control character.  We return that multibyte
279
 * character.  If crep is an invalid multibyte sequence, it will be
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
280
 * replaced with Unicode 0xFFFD (Replacement Character). */
281
282
283
284
285
char *mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
286
    if (use_utf8) {
287
288
	wchar_t wc;

289
290
	/* Reject invalid Unicode characters. */
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0 || !is_valid_unicode(wc)) {
291
	    mbtowc_reset();
292
	    *crep_len = bad_mbchar_len;
293
	    strncpy(crep, bad_mbchar, *crep_len);
294
295
296
297
	} else {
	    *crep_len = wctomb(crep, wc);

	    if (*crep_len < 0) {
298
		wctomb_reset();
299
300
301
302
303
304
305
306
307
308
309
310
311
312
		*crep_len = 0;
	    }
	}
    } else {
#endif
	*crep_len = 1;
	*crep = *c;
#ifdef ENABLE_UTF8
    }
#endif

    return crep;
}

313
314
315
316
317
/* This function is equivalent to wcwidth() for multibyte characters. */
int mbwidth(const char *c)
{
    assert(c != NULL);

318
#ifdef ENABLE_UTF8
319
    if (use_utf8) {
320
	wchar_t wc;
321
	int width;
322

323
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
324
	    mbtowc_reset();
325
	    wc = bad_wchar;
326
327
328
	}

	width = wcwidth(wc);
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
329

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
330
331
332
333
	if (width == -1) {
	    wc = bad_wchar;
	    width = wcwidth(wc);
	}
334
335
336
337
338
339
340
341
342
343

	return width;
    } else
#endif
	return 1;
}

/* Return the maximum width in bytes of a multibyte character: the
 * value of MB_CUR_MAX when UTF-8 support is on, and 1 otherwise. */
int mb_cur_max(void)
{
    return
#ifdef ENABLE_UTF8
	use_utf8 ? MB_CUR_MAX :
#endif
	1;
}

/* Convert the Unicode value in chr to a multibyte character with the
 * same wide character value as chr, if possible.  If the conversion
 * succeeds, return the (dynamically allocated) multibyte character and
 * store its length in *chr_mb_len.  Otherwise, return an undefined
 * (dynamically allocated) multibyte character and a length of zero.
 * Without UTF-8 support, the result is the single byte (char)chr and
 * the length is always one. */
char *make_mbchar(long chr, int *chr_mb_len)
{
    char *chr_mb;
    char chr_byte;

    assert(chr_mb_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	chr_mb = charalloc(MB_CUR_MAX);
	*chr_mb_len = wctomb(chr_mb, (wchar_t)chr);

	/* Reject invalid Unicode characters. */
	if (*chr_mb_len < 0 || !is_valid_unicode((wchar_t)chr)) {
	    wctomb_reset();
	    *chr_mb_len = 0;
	}
    } else {
#endif
	/* Narrow chr by value.  Reading the first byte of the long in
	 * memory would pick its most significant byte on a big-endian
	 * host. */
	chr_byte = (char)chr;

	*chr_mb_len = 1;
	chr_mb = mallocstrncpy(NULL, &chr_byte, 1);
#ifdef ENABLE_UTF8
    }
#endif

    return chr_mb;
}

/* Parse a multibyte character from buf.  Return the number of bytes
 * used.  If chr isn't NULL, store the multibyte character in it.  If
385
386
387
 * col isn't NULL, store the new display width in it.  If *buf is '\t',
 * we expect col to have the current display width. */
int parse_mbchar(const char *buf, char *chr, size_t *col)
388
389
390
391
392
{
    int buf_mb_len;

    assert(buf != NULL);

393
#ifdef ENABLE_UTF8
394
    if (use_utf8) {
395
396
397
	/* Get the number of bytes in the multibyte character. */
	buf_mb_len = mblen(buf, MB_CUR_MAX);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
398
399
	/* If buf contains an invalid multibyte character, only
	 * interpret buf's first byte. */
400
	if (buf_mb_len < 0) {
401
	    IGNORE_CALL_RESULT(mblen(NULL, 0));
402
	    buf_mb_len = 1;
403
404
	} else if (buf_mb_len == 0)
	    buf_mb_len++;
405
406
407
408

	/* Save the multibyte character in chr. */
	if (chr != NULL) {
	    int i;
409

410
411
412
413
414
415
416
417
418
419
420
421
422
	    for (i = 0; i < buf_mb_len; i++)
		chr[i] = buf[i];
	}

	/* Save the column width of the wide character in col. */
	if (col != NULL) {
	    /* If we have a tab, get its width in columns using the
	     * current value of col. */
	    if (*buf == '\t')
		*col += tabsize - *col % tabsize;
	    /* If we have a control character, get its width using one
	     * column for the "^" that will be displayed in front of it,
	     * and the width in columns of its visible equivalent as
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
423
	     * returned by control_mbrep(). */
424
	    else if (is_cntrl_mbchar(buf)) {
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
425
		char *ctrl_buf_mb = charalloc(MB_CUR_MAX);
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
		int ctrl_buf_mb_len;

		(*col)++;

		ctrl_buf_mb = control_mbrep(buf, ctrl_buf_mb,
			&ctrl_buf_mb_len);

		*col += mbwidth(ctrl_buf_mb);

		free(ctrl_buf_mb);
	    /* If we have a normal character, get its width in columns
	     * normally. */
	    } else
		*col += mbwidth(buf);
	}
    } else {
#endif
	/* Get the number of bytes in the byte character. */
	buf_mb_len = 1;

	/* Save the byte character in chr. */
	if (chr != NULL)
	    *chr = *buf;

	if (col != NULL) {
	    /* If we have a tab, get its width in columns using the
	     * current value of col. */
	    if (*buf == '\t')
		*col += tabsize - *col % tabsize;
	    /* If we have a control character, it's two columns wide:
	     * one column for the "^" that will be displayed in front of
	     * it, and one column for its visible equivalent as returned
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
458
	     * by control_mbrep(). */
459
460
461
462
463
464
	    else if (is_cntrl_char((unsigned char)*buf))
		*col += 2;
	    /* If we have a normal character, it's one column wide. */
	    else
		(*col)++;
	}
465
#ifdef ENABLE_UTF8
466
467
468
469
470
    }
#endif

    return buf_mb_len;
}
471
472
473
474
475
476
477

/* Return the index in buf of the beginning of the multibyte character
 * before the one at pos. */
size_t move_mbleft(const char *buf, size_t pos)
{
    size_t pos_prev = pos;

478
    assert(buf != NULL && pos <= strlen(buf));
479
480
481
482

    /* There is no library function to move backward one multibyte
     * character.  Here is the naive, O(pos) way to do it. */
    while (TRUE) {
483
	int buf_mb_len = parse_mbchar(buf + pos - pos_prev, NULL, NULL);
484

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
485
	if (pos_prev <= buf_mb_len)
486
487
488
489
490
491
492
493
494
495
496
497
	    break;

	pos_prev -= buf_mb_len;
    }

    return pos - pos_prev;
}

/* Return the index in buf of the beginning of the multibyte character
 * after the one at pos, i.e. pos plus the byte length that
 * parse_mbchar() reports for the character at pos. */
size_t move_mbright(const char *buf, size_t pos)
{
    return pos + parse_mbchar(buf + pos, NULL, NULL);
}

#ifndef HAVE_STRCASECMP
/* This function is equivalent to strcasecmp().  It is implemented as
 * strncasecmp() with the largest possible length limit. */
int nstrcasecmp(const char *s1, const char *s2)
{
    return strncasecmp(s1, s2, (size_t)-1);
}
#endif

/* This function is equivalent to strcasecmp() for multibyte strings.
 * It is implemented as mbstrncasecmp() with the largest possible
 * length limit. */
int mbstrcasecmp(const char *s1, const char *s2)
{
    return mbstrncasecmp(s1, s2, (size_t)-1);
}

#ifndef HAVE_STRNCASECMP
/* This function is equivalent to strncasecmp(). */
int nstrncasecmp(const char *s1, const char *s2, size_t n)
{
519
520
521
    if (s1 == s2)
	return 0;

522
523
    assert(s1 != NULL && s2 != NULL);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
524
    for (; *s1 != '\0' && *s2 != '\0' && n > 0; s1++, s2++, n--) {
525
526
527
528
	if (tolower(*s1) != tolower(*s2))
	    break;
    }

529
    return (n > 0) ? tolower(*s1) - tolower(*s2) : 0;
530
531
532
533
534
535
536
}
#endif

/* This function is equivalent to strncasecmp() for multibyte
 * strings: compare at most n multibyte characters of s1 and s2,
 * ignoring case.  Return zero if they match, a negative value if s1
 * sorts first, and a positive value if s2 sorts first.  An invalid
 * multibyte sequence is compared by its first byte, and never matches a
 * valid character. */
int mbstrncasecmp(const char *s1, const char *s2, size_t n)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	if (s1 == s2)
	    return 0;

	assert(s1 != NULL && s2 != NULL);

	for (; *s1 != '\0' && *s2 != '\0' && n > 0; n--) {
	    bool bad_s1_mb = FALSE, bad_s2_mb = FALSE;
	    wchar_t ws1, ws2;

	    if (mbtowc(&ws1, s1, MB_CUR_MAX) < 0) {
		mbtowc_reset();
		bad_s1_mb = TRUE;
	    }

	    if (mbtowc(&ws2, s2, MB_CUR_MAX) < 0) {
		mbtowc_reset();
		bad_s2_mb = TRUE;
	    }

	    if (bad_s1_mb || bad_s2_mb) {
		/* At least one side is not a valid character, so fall
		 * back to comparing the raw first bytes. */
		if (*s1 != *s2)
		    return (unsigned char)*s1 - (unsigned char)*s2;

		/* Same first byte, but only one side is valid: they
		 * are still different characters. */
		if (bad_s1_mb != bad_s2_mb)
		    return bad_s1_mb ? 1 : -1;
	    } else {
		const wint_t lower1 = towlower(ws1);
		const wint_t lower2 = towlower(ws2);

		if (lower1 != lower2)
		    return (lower1 < lower2) ? -1 : 1;
	    }

	    s1 += move_mbright(s1, 0);
	    s2 += move_mbright(s2, 0);
	}

	/* Every compared character matched.  If the limit wasn't
	 * reached, at least one string has ended, so the bytes we
	 * stopped on decide the result. */
	return (n > 0) ? (unsigned char)*s1 - (unsigned char)*s2 : 0;
    } else
#endif
	return strncasecmp(s1, s2, n);
}

#ifndef HAVE_STRCASESTR
586
/* This function is equivalent to strcasestr(). */
587
char *nstrcasestr(const char *haystack, const char *needle)
588
{
589
590
    size_t haystack_len, needle_len;

591
592
    assert(haystack != NULL && needle != NULL);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
593
    if (*needle == '\0')
594
	return (char *)haystack;
595

596
597
    haystack_len = strlen(haystack);
    needle_len = strlen(needle);
598

599
600
601
    for (; *haystack != '\0' && haystack_len >= needle_len; haystack++,
	haystack_len--) {
	if (strncasecmp(haystack, needle, needle_len) == 0)
602
	    return (char *)haystack;
603
604
605
606
607
608
    }

    return NULL;
}
#endif

/* This function is equivalent to strcasestr() for multibyte strings:
 * return a pointer to the first case-insensitive occurrence of needle
 * in haystack, or NULL if there is none.  An empty needle matches at
 * the start of haystack. */
char *mbstrcasestr(const char *haystack, const char *needle)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	size_t haystack_len, needle_len;
	    /* Both lengths are counted in multibyte characters. */

	assert(haystack != NULL && needle != NULL);

	if (*needle == '\0')
	    return (char *)haystack;

	haystack_len = mbstrlen(haystack);
	needle_len = mbstrlen(needle);

	/* Stop as soon as the remaining text is shorter than needle. */
	for (; *haystack != '\0' && haystack_len >= needle_len;
		haystack += move_mbright(haystack, 0), haystack_len--) {
	    if (mbstrncasecmp(haystack, needle, needle_len) == 0)
		return (char *)haystack;
	}

	return NULL;
    } else
#endif
	return (char *) strcasestr(haystack, needle);
}

636
#if !defined(NANO_TINY) || !defined(DISABLE_TABCOMP)
637
/* This function is equivalent to strstr(), except in that it scans the
638
 * string in reverse, starting at rev_start. */
639
640
char *revstrstr(const char *haystack, const char *needle, const char
	*rev_start)
641
{
642
643
    size_t rev_start_len, needle_len;

644
645
    assert(haystack != NULL && needle != NULL && rev_start != NULL);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
646
    if (*needle == '\0')
647
	return (char *)rev_start;
648

649
    needle_len = strlen(needle);
650

651
652
    if (strlen(haystack) < needle_len)
	return NULL;
653

654
655
656
657
658
    rev_start_len = strlen(rev_start);

    for (; rev_start >= haystack; rev_start--, rev_start_len++) {
	if (rev_start_len >= needle_len && strncmp(rev_start, needle,
		needle_len) == 0)
659
	    return (char *)rev_start;
660
661
662
663
    }

    return NULL;
}
664
#endif /* !NANO_TINY || !DISABLE_TABCOMP */
665

666
#ifndef NANO_TINY
667
/* This function is equivalent to strcasestr(), except in that it scans
668
 * the string in reverse, starting at rev_start. */
669
670
char *revstrcasestr(const char *haystack, const char *needle, const char
	*rev_start)
671
{
672
673
    size_t rev_start_len, needle_len;

674
675
    assert(haystack != NULL && needle != NULL && rev_start != NULL);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
676
    if (*needle == '\0')
677
	return (char *)rev_start;
678

679
680
681
682
    needle_len = strlen(needle);

    if (strlen(haystack) < needle_len)
	return NULL;
683

684
    rev_start_len = strlen(rev_start);
685

686
687
688
    for (; rev_start >= haystack; rev_start--, rev_start_len++) {
	if (rev_start_len >= needle_len && strncasecmp(rev_start,
		needle, needle_len) == 0)
689
	    return (char *)rev_start;
690
691
692
693
    }

    return NULL;
}
694
695
696
697

/* This function is equivalent to strcasestr() for multibyte strings,
 * except in that it scans the string in reverse, starting at
 * rev_start.  Return a pointer to the case-insensitive match nearest to
 * rev_start at or before it, or NULL if there is none.  An empty needle
 * matches at rev_start. */
char *mbrevstrcasestr(const char *haystack, const char *needle, const
	char *rev_start)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	bool begin_line = FALSE;
	size_t rev_start_len, needle_len;
	    /* Both lengths are counted in multibyte characters. */

	assert(haystack != NULL && needle != NULL && rev_start != NULL);

	if (*needle == '\0')
	    return (char *)rev_start;

	needle_len = mbstrlen(needle);

	if (mbstrlen(haystack) < needle_len)
	    return NULL;

	rev_start_len = mbstrlen(rev_start);

	while (!begin_line) {
	    if (rev_start_len >= needle_len && mbstrncasecmp(rev_start,
		needle, needle_len) == 0)
		return (char *)rev_start;

	    if (rev_start == haystack)
		begin_line = TRUE;
	    else {
		/* Step back one whole multibyte character. */
		rev_start = haystack + move_mbleft(haystack, rev_start -
			haystack);
		rev_start_len++;
	    }
	}

	return NULL;
    } else
#endif
	return revstrcasestr(haystack, needle, rev_start);
}
#endif /* !NANO_TINY */
/* This function is equivalent to strlen() for multibyte strings.  It
 * is implemented as mbstrnlen() with the largest possible length
 * limit. */
size_t mbstrlen(const char *s)
{
    return mbstrnlen(s, (size_t)-1);
}

745
746
747
748
749
750
751
752
#ifndef HAVE_STRNLEN
/* This function is equivalent to strnlen(). */
size_t nstrnlen(const char *s, size_t maxlen)
{
    size_t n = 0;

    assert(s != NULL);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
753
    for (; *s != '\0' && maxlen > 0; s++, maxlen--, n++)
754
755
756
757
758
759
760
761
762
763
764
	;

    return n;
}
#endif

/* This function is equivalent to strnlen() for multibyte strings:
 * return the number of multibyte characters in s before its
 * terminating null byte, but at most maxlen.  Without UTF-8 support,
 * this is plain strnlen(). */
size_t mbstrnlen(const char *s, size_t maxlen)
{
    assert(s != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	size_t n = 0;

	for (; *s != '\0' && maxlen > 0; s += move_mbright(s, 0),
		maxlen--, n++)
	    ;

	return n;
    } else
#endif
	return strnlen(s, maxlen);
}

779
#if !defined(NANO_TINY) || !defined(DISABLE_JUSTIFY)
780
/* This function is equivalent to strchr() for multibyte strings. */
781
char *mbstrchr(const char *s, const char *c)
782
783
784
785
{
    assert(s != NULL && c != NULL);

#ifdef ENABLE_UTF8
786
    if (use_utf8) {
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
787
	bool bad_s_mb = FALSE, bad_c_mb = FALSE;
788
789
790
791
792
	char *s_mb = charalloc(MB_CUR_MAX);
	const char *q = s;
	wchar_t ws, wc;
	int c_mb_len = mbtowc(&wc, c, MB_CUR_MAX);

793
	if (c_mb_len < 0) {
794
	    mbtowc_reset();
795
796
797
798
799
	    wc = (unsigned char)*c;
	    bad_c_mb = TRUE;
	}

	while (*s != '\0') {
800
	    int s_mb_len = parse_mbchar(s, s_mb, NULL);
801

802
	    if (mbtowc(&ws, s_mb, s_mb_len) < 0) {
803
		mbtowc_reset();
804
805
806
807
808
809
810
811
812
813
814
815
816
		ws = (unsigned char)*s;
		bad_s_mb = TRUE;
	    }

	    if (bad_s_mb == bad_c_mb && ws == wc)
		break;

	    s += s_mb_len;
	    q += s_mb_len;
	}

	free(s_mb);

817
	if (*s == '\0')
818
819
820
821
822
	    q = NULL;

	return (char *)q;
    } else
#endif
823
	return (char *) strchr(s, *c);
824
}
825
#endif /* !NANO_TINY || !DISABLE_JUSTIFY */
826

827
828
829
830
831
832
833
#ifndef NANO_TINY
/* This function is equivalent to strpbrk() for multibyte strings. */
char *mbstrpbrk(const char *s, const char *accept)
{
    assert(s != NULL && accept != NULL);

#ifdef ENABLE_UTF8
834
    if (use_utf8) {
835
	for (; *s != '\0'; s += move_mbright(s, 0)) {
836
837
838
839
840
841
842
	    if (mbstrchr(accept, s) != NULL)
		return (char *)s;
	}

	return NULL;
    } else
#endif
843
	return (char *) strpbrk(s, accept);
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
}

/* This function is equivalent to strpbrk(), except in that it scans the
 * string in reverse, starting at rev_start. */
char *revstrpbrk(const char *s, const char *accept, const char
	*rev_start)
{
    assert(s != NULL && accept != NULL && rev_start != NULL);

    for (; rev_start >= s; rev_start--) {
	const char *q = (*rev_start == '\0') ? NULL : strchr(accept,
		*rev_start);

	if (q != NULL)
	    return (char *)rev_start;
    }

    return NULL;
}

/* This function is equivalent to strpbrk() for multibyte strings,
 * except in that it scans the string in reverse, starting at
 * rev_start.  Return a pointer to the multibyte character nearest to
 * rev_start at or before it that occurs in accept, or NULL if there is
 * none.  A null byte at rev_start never matches. */
char *mbrevstrpbrk(const char *s, const char *accept, const char
	*rev_start)
{
    assert(s != NULL && accept != NULL && rev_start != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	bool begin_line = FALSE;

	while (!begin_line) {
	    const char *q = (*rev_start == '\0') ? NULL :
		mbstrchr(accept, rev_start);

	    if (q != NULL)
		return (char *)rev_start;

	    if (rev_start == s)
		begin_line = TRUE;
	    else
		/* Step back one whole multibyte character. */
		rev_start = s + move_mbleft(s, rev_start - s);
	}

	return NULL;
    } else
#endif
	return revstrpbrk(s, accept, rev_start);
}
#endif /* !NANO_TINY */

#if defined(ENABLE_NANORC) && (!defined(NANO_TINY) || !defined(DISABLE_JUSTIFY))
/* Return TRUE if the string s contains one or more blank characters,
 * and FALSE otherwise. */
bool has_blank_chars(const char *s)
{
    assert(s != NULL);

    for (; *s != '\0'; s++) {
	/* isblank() is only defined for values representable as
	 * unsigned char (or EOF), so convert the byte first. */
	if (isblank((unsigned char)*s))
	    return TRUE;
    }

    return FALSE;
}

/* Return TRUE if the multibyte string s contains one or more blank
 * multibyte characters, and FALSE otherwise. */
bool has_blank_mbchars(const char *s)
{
    assert(s != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	/* Test each character in place.  Copying it into a scratch
	 * buffer first would leave the bytes after it uninitialized,
	 * and is_blank_mbchar() may look at up to MB_CUR_MAX bytes. */
	for (; *s != '\0'; s += move_mbright(s, 0)) {
	    if (is_blank_mbchar(s))
		return TRUE;
	}

	return FALSE;
    } else
#endif
	return has_blank_chars(s);
}
#endif /* ENABLE_NANORC && (!NANO_TINY || !DISABLE_JUSTIFY) */
#ifdef ENABLE_UTF8
/* Return TRUE if wc is valid Unicode, and FALSE otherwise.  A value is
 * rejected when it is outside 0 to 0x10FFFF, when it is in 0xD800 to
 * 0xDFFF or in 0xFDD0 to 0xFDEF, or when its low sixteen bits are
 * 0xFFFE or 0xFFFF. */
bool is_valid_unicode(wchar_t wc)
{
    return ((0 <= wc && wc <= 0x10FFFF) && (wc <= 0xD7FF || 0xE000 <=
	wc) && (wc <= 0xFDCF || 0xFDF0 <= wc) && ((wc & 0xFFFF) <=
	0xFFFD));
}
#endif

#ifdef ENABLE_NANORC
/* Check if the string s is a valid multibyte string.  Return TRUE if it
 * is, and FALSE otherwise.  With UTF-8 support on, s is valid when
 * mbstowcs() can convert all of it; otherwise every string is
 * considered valid. */
bool is_valid_mbstring(const char *s)
{
    assert(s != NULL);

    return
#ifdef ENABLE_UTF8
	use_utf8 ? (mbstowcs(NULL, s, 0) != (size_t)-1) :
#endif
	TRUE;
}
#endif /* ENABLE_NANORC */