/* $Id$ */
/**************************************************************************
 *   chars.c                                                              *
 *                                                                        *
 *   Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009   *
 *   Free Software Foundation, Inc.                                       *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 3, or (at your option)  *
 *   any later version.                                                   *
 *                                                                        *
 *   This program is distributed in the hope that it will be useful, but  *
 *   WITHOUT ANY WARRANTY; without even the implied warranty of           *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    *
 *   General Public License for more details.                             *
 *                                                                        *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the Free Software          *
 *   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA            *
 *   02110-1301, USA.                                                     *
 *                                                                        *
 **************************************************************************/

24
#include "proto.h"
25

26
#include <string.h>
27
28
#include <ctype.h>

29
#ifdef ENABLE_UTF8
30
#ifdef HAVE_WCHAR_H
31
32
#include <wchar.h>
#endif
33
#ifdef HAVE_WCTYPE_H
34
35
#include <wctype.h>
#endif
36

37
38
static bool use_utf8 = FALSE;
	/* Whether we've enabled UTF-8 support. */
static const wchar_t bad_wchar = 0xFFFD;
	/* If we get an invalid multibyte sequence, we treat it as
	 * Unicode FFFD (Replacement Character), unless we're searching
	 * for a match to it. */
static const char *const bad_mbchar = "\xEF\xBF\xBD";
	/* The UTF-8 byte sequence that encodes bad_wchar. */
static const int bad_mbchar_len = 3;
	/* The number of bytes in bad_mbchar, not counting the
	 * terminating null. */
45
46
47
48
49
50
51
52
53
54
55
56

/* Switch the multibyte routines in this file over to their UTF-8 code
 * paths by setting the use_utf8 flag. */
void utf8_init(void)
{
    use_utf8 = TRUE;
}

/* Report the current value of the use_utf8 flag, i.e. whether
 * utf8_init() has been called. */
bool using_utf8(void)
{
    return use_utf8;
}
57
#endif
58

59
60
#ifndef HAVE_ISBLANK
/* This function is equivalent to isblank(). */
61
bool nisblank(int c)
62
{
63
    return isspace(c) && (c == '\t' || !is_cntrl_char(c));
64
}
65
#endif
66

67
#if !defined(HAVE_ISWBLANK) && defined(ENABLE_UTF8)
68
/* This function is equivalent to iswblank(). */
69
bool niswblank(wchar_t wc)
70
{
71
    return iswspace(wc) && (wc == '\t' || !is_cntrl_wchar(wc));
72
}
73
#endif
74

75
76
77
78
79
80
81
/* Return TRUE if the value of c survives a round trip through
 * unsigned char unchanged (i.e. it is in byte range), and FALSE
 * otherwise. */
bool is_byte(int c)
{
    const unsigned int full_value = (unsigned int)c;
    const unsigned char low_byte = (unsigned char)c;

    return (full_value == low_byte);
}

82
/* Return mbtowc() to its initial conversion state by calling it with a
 * null string.  The return value is deliberately ignored. */
void mbtowc_reset(void)
{
    IGNORE_CALL_RESULT(mbtowc(NULL, NULL, 0));
}

87
/* Return wctomb() to its initial conversion state by calling it with a
 * null buffer.  The return value is deliberately ignored. */
void wctomb_reset(void)
{
    IGNORE_CALL_RESULT(wctomb(NULL, 0));
}

92
93
94
95
96
/* This function is equivalent to isalnum() for multibyte characters.
 * In UTF-8 mode the character at c is decoded and tested with
 * iswalnum(); otherwise only the first byte of c is tested. */
bool is_alnum_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	/* An undecodable sequence is classified as bad_wchar. */
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wc = bad_wchar;
	}

	return iswalnum(wc);
    } else
#endif
	return isalnum((unsigned char)*c);
}

112
113
114
115
116
/* This function is equivalent to isblank() for multibyte characters.
 * In UTF-8 mode the character at c is decoded and tested with
 * iswblank(); otherwise only the first byte of c is tested. */
bool is_blank_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	/* An undecodable sequence is classified as bad_wchar. */
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wc = bad_wchar;
	}

	return iswblank(wc);
    } else
#endif
	return isblank((unsigned char)*c);
}

132
133
134
135
136
137
138
/* This function is equivalent to iscntrl(), except in that it only
 * handles non-high-bit control characters: it is true exactly for the
 * values 0 through 31. */
bool is_ascii_cntrl_char(int c)
{
    return !(c < 0 || c >= 32);
}

139
/* This function is equivalent to iscntrl(), except in that it also
 * handles high-bit control characters.  Three ranges are accepted:
 * 0 to 31, 127 to 159, and -128 to -97 (the values that bytes 128 to
 * 159 take when held in a signed 8-bit char). */
bool is_cntrl_char(int c)
{
    return (-128 <= c && c < -96) || (0 <= c && c < 32) ||
	(127 <= c && c < 160);
}

147
#ifdef ENABLE_UTF8
148
149
150
/* This function is equivalent to iscntrl() for wide characters, except
 * in that it also handles wide control characters with their high bits
 * set.  It is true for the values 0 to 31 and 127 to 159. */
bool is_cntrl_wchar(wchar_t wc)
{
    return (0 <= wc && wc < 32) || (127 <= wc && wc < 160);
}
#endif

157
158
159
160
161
162
163
/* This function is equivalent to iscntrl() for multibyte characters,
 * except in that it also handles multibyte control characters with
 * their high bits set.  In UTF-8 mode the character at c is decoded
 * and tested with is_cntrl_wchar(); otherwise the first byte of c is
 * tested with is_cntrl_char(). */
bool is_cntrl_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	/* An undecodable sequence is classified as bad_wchar. */
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wc = bad_wchar;
	}

	return is_cntrl_wchar(wc);
    } else
#endif
	return is_cntrl_char((unsigned char)*c);
}

179
180
/* This function is equivalent to ispunct() for multibyte characters.
 * In UTF-8 mode the character at c is decoded and tested with
 * iswpunct(); otherwise only the first byte of c is tested. */
bool is_punct_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	/* An undecodable sequence is classified as bad_wchar. */
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wc = bad_wchar;
	}

	return iswpunct(wc);
    } else
#endif
	return ispunct((unsigned char)*c);
}

/* Return TRUE for a multibyte character found in a word (currently only
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
200
 * an alphanumeric or punctuation character, and only the latter if
201
202
203
204
205
206
207
 * allow_punct is TRUE) and FALSE otherwise. */
bool is_word_mbchar(const char *c, bool allow_punct)
{
    assert(c != NULL);

    return is_alnum_mbchar(c) || (allow_punct ? is_punct_mbchar(c) :
	FALSE);
208
209
}

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
210
/* c is a control character.  It displays as ^@, ^?, or ^[ch], where ch
211
 * is (c + 64).  We return that character. */
212
char control_rep(char c)
213
{
214
215
    assert(is_cntrl_char(c));

216
217
218
219
220
221
222
223
224
    /* Treat newlines embedded in a line as encoded nulls. */
    if (c == '\n')
	return '@';
    else if (c == NANO_CONTROL_8)
	return '?';
    else
	return c + 64;
}

225
#ifdef ENABLE_UTF8
226
/* c is a wide control character.  It displays as ^@, ^?, or ^[ch],
227
 * where ch is (c + 64).  We return that wide character. */
228
229
wchar_t control_wrep(wchar_t wc)
{
230
231
    assert(is_cntrl_wchar(wc));

232
233
234
235
236
237
238
239
240
241
242
    /* Treat newlines embedded in a line as encoded nulls. */
    if (wc == '\n')
	return '@';
    else if (wc == NANO_CONTROL_8)
	return '?';
    else
	return wc + 64;
}
#endif

/* c is a multibyte control character.  It displays as ^@, ^?, or ^[ch],
 * where ch is (c + 64).  We store that multibyte character in crep,
 * store its length in bytes in *crep_len, and return crep.  If c is an
 * invalid multibyte sequence, crep receives Unicode 0xFFFD (Replacement
 * Character) instead.  If the result cannot be encoded, *crep_len is
 * set to zero.  crep is not null-terminated. */
char *control_mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    *crep_len = bad_mbchar_len;
	    /* Raw byte copy: no terminator is wanted here. */
	    memcpy(crep, bad_mbchar, bad_mbchar_len);
	} else {
	    *crep_len = wctomb(crep, control_wrep(wc));

	    if (*crep_len < 0) {
		wctomb_reset();
		*crep_len = 0;
	    }
	}
    } else {
#endif
	*crep_len = 1;
	*crep = control_rep(*c);
#ifdef ENABLE_UTF8
    }
#endif

    return crep;
}

277
/* c is a multibyte non-control character.  We store that multibyte
 * character in crep, store its length in bytes in *crep_len, and return
 * crep.  If c is an invalid multibyte sequence or decodes to invalid
 * Unicode, crep receives Unicode 0xFFFD (Replacement Character)
 * instead.  If the character cannot be re-encoded, *crep_len is set to
 * zero.  crep is not null-terminated. */
char *mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;

	/* Reject invalid Unicode characters. */
	if (mbtowc(&wc, c, MB_CUR_MAX) < 0 || !is_valid_unicode(wc)) {
	    mbtowc_reset();
	    *crep_len = bad_mbchar_len;
	    /* Raw byte copy: no terminator is wanted here. */
	    memcpy(crep, bad_mbchar, bad_mbchar_len);
	} else {
	    *crep_len = wctomb(crep, wc);

	    if (*crep_len < 0) {
		wctomb_reset();
		*crep_len = 0;
	    }
	}
    } else {
#endif
	*crep_len = 1;
	*crep = *c;
#ifdef ENABLE_UTF8
    }
#endif

    return crep;
}

312
313
314
315
316
/* This function is equivalent to wcwidth() for multibyte characters.
 * Outside UTF-8 mode every character is one column wide. */
int mbwidth(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;
	int width;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wc = bad_wchar;
	}

	width = wcwidth(wc);

	/* If wcwidth() has no width for this character, use the
	 * width of the replacement character instead. */
	if (width == -1)
	    width = wcwidth(bad_wchar);

	return width;
    } else
#endif
	return 1;
}

/* Return the maximum width in bytes of a multibyte character:
 * MB_CUR_MAX in UTF-8 mode, and 1 otherwise. */
int mb_cur_max(void)
{
#ifdef ENABLE_UTF8
    if (use_utf8)
	return MB_CUR_MAX;
#endif

    return 1;
}

350
351
/* Convert the Unicode value in chr to a multibyte character with the
 * same wide character value as chr, if possible.  If the conversion
 * succeeds, return the (dynamically allocated) multibyte character and
 * its length.  Otherwise, return an undefined (dynamically allocated)
 * multibyte character and a length of zero.  Outside UTF-8 mode the
 * result is always the single low-order byte of chr. */
char *make_mbchar(long chr, int *chr_mb_len)
{
    char *chr_mb;
    char chr_byte;

    assert(chr_mb_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	chr_mb = charalloc(MB_CUR_MAX);
	*chr_mb_len = wctomb(chr_mb, (wchar_t)chr);

	/* Reject invalid Unicode characters. */
	if (*chr_mb_len < 0 || !is_valid_unicode((wchar_t)chr)) {
	    wctomb_reset();
	    *chr_mb_len = 0;
	}
    } else {
#endif
	/* Narrow by value rather than reading the first byte of chr in
	 * memory, which is the high-order byte on big-endian hosts. */
	chr_byte = (char)chr;

	*chr_mb_len = 1;
	chr_mb = mallocstrncpy(NULL, &chr_byte, 1);
#ifdef ENABLE_UTF8
    }
#endif

    return chr_mb;
}

/* Parse a multibyte character from buf.  Return the number of bytes
 * used.  If chr isn't NULL, store the multibyte character in it.  If
384
385
386
 * col isn't NULL, store the new display width in it.  If *buf is '\t',
 * we expect col to have the current display width. */
int parse_mbchar(const char *buf, char *chr, size_t *col)
387
388
389
390
391
{
    int buf_mb_len;

    assert(buf != NULL);

392
#ifdef ENABLE_UTF8
393
    if (use_utf8) {
394
395
396
	/* Get the number of bytes in the multibyte character. */
	buf_mb_len = mblen(buf, MB_CUR_MAX);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
397
398
	/* If buf contains an invalid multibyte character, only
	 * interpret buf's first byte. */
399
	if (buf_mb_len < 0) {
400
	    IGNORE_CALL_RESULT(mblen(NULL, 0));
401
	    buf_mb_len = 1;
402
403
	} else if (buf_mb_len == 0)
	    buf_mb_len++;
404
405
406
407

	/* Save the multibyte character in chr. */
	if (chr != NULL) {
	    int i;
408

409
410
411
412
413
414
415
416
417
418
419
420
421
	    for (i = 0; i < buf_mb_len; i++)
		chr[i] = buf[i];
	}

	/* Save the column width of the wide character in col. */
	if (col != NULL) {
	    /* If we have a tab, get its width in columns using the
	     * current value of col. */
	    if (*buf == '\t')
		*col += tabsize - *col % tabsize;
	    /* If we have a control character, get its width using one
	     * column for the "^" that will be displayed in front of it,
	     * and the width in columns of its visible equivalent as
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
422
	     * returned by control_mbrep(). */
423
	    else if (is_cntrl_mbchar(buf)) {
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
424
		char *ctrl_buf_mb = charalloc(MB_CUR_MAX);
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
		int ctrl_buf_mb_len;

		(*col)++;

		ctrl_buf_mb = control_mbrep(buf, ctrl_buf_mb,
			&ctrl_buf_mb_len);

		*col += mbwidth(ctrl_buf_mb);

		free(ctrl_buf_mb);
	    /* If we have a normal character, get its width in columns
	     * normally. */
	    } else
		*col += mbwidth(buf);
	}
    } else {
#endif
	/* Get the number of bytes in the byte character. */
	buf_mb_len = 1;

	/* Save the byte character in chr. */
	if (chr != NULL)
	    *chr = *buf;

	if (col != NULL) {
	    /* If we have a tab, get its width in columns using the
	     * current value of col. */
	    if (*buf == '\t')
		*col += tabsize - *col % tabsize;
	    /* If we have a control character, it's two columns wide:
	     * one column for the "^" that will be displayed in front of
	     * it, and one column for its visible equivalent as returned
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
457
	     * by control_mbrep(). */
458
459
460
461
462
463
	    else if (is_cntrl_char((unsigned char)*buf))
		*col += 2;
	    /* If we have a normal character, it's one column wide. */
	    else
		(*col)++;
	}
464
#ifdef ENABLE_UTF8
465
466
467
468
469
    }
#endif

    return buf_mb_len;
}
470
471
472
473
474
475
476

/* Return the index in buf of the beginning of the multibyte character
 * before the one at pos. */
size_t move_mbleft(const char *buf, size_t pos)
{
    size_t pos_prev = pos;

477
    assert(buf != NULL && pos <= strlen(buf));
478
479
480
481

    /* There is no library function to move backward one multibyte
     * character.  Here is the naive, O(pos) way to do it. */
    while (TRUE) {
482
	int buf_mb_len = parse_mbchar(buf + pos - pos_prev, NULL, NULL);
483

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
484
	if (pos_prev <= buf_mb_len)
485
486
487
488
489
490
491
492
493
494
495
496
	    break;

	pos_prev -= buf_mb_len;
    }

    return pos - pos_prev;
}

/* Return the index in buf of the beginning of the multibyte character
 * after the one at pos, i.e. pos plus the byte length that
 * parse_mbchar() reports for the character at pos. */
size_t move_mbright(const char *buf, size_t pos)
{
    return pos + parse_mbchar(buf + pos, NULL, NULL);
}
499
500
501
502
503

#ifndef HAVE_STRCASECMP
/* This function is equivalent to strcasecmp().  It is implemented as
 * strncasecmp() with the largest possible length limit. */
int nstrcasecmp(const char *s1, const char *s2)
{
    return strncasecmp(s1, s2, (size_t)-1);
}
#endif

/* This function is equivalent to strcasecmp() for multibyte strings.
 * It is implemented as mbstrncasecmp() with the largest possible
 * length limit. */
int mbstrcasecmp(const char *s1, const char *s2)
{
    return mbstrncasecmp(s1, s2, (size_t)-1);
}

#ifndef HAVE_STRNCASECMP
/* This function is equivalent to strncasecmp(). */
int nstrncasecmp(const char *s1, const char *s2, size_t n)
{
518
519
520
    if (s1 == s2)
	return 0;

521
522
    assert(s1 != NULL && s2 != NULL);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
523
    for (; *s1 != '\0' && *s2 != '\0' && n > 0; s1++, s2++, n--) {
524
525
526
527
	if (tolower(*s1) != tolower(*s2))
	    break;
    }

528
    return (n > 0) ? tolower(*s1) - tolower(*s2) : 0;
529
530
531
532
533
534
535
}
#endif

/* This function is equivalent to strncasecmp() for multibyte
 * strings: compare at most n multibyte characters of s1 and s2,
 * ignoring case.  Return zero if they match, and a nonzero value
 * otherwise.  An invalid byte never matches a valid character, even
 * one with the same numeric value. */
int mbstrncasecmp(const char *s1, const char *s2, size_t n)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t ws1 = 0, ws2 = 0;
	bool bad_s1_mb = FALSE, bad_s2_mb = FALSE;
	int diff;

	if (s1 == s2)
	    return 0;

	assert(s1 != NULL && s2 != NULL);

	/* Decode the characters at the current positions BEFORE
	 * testing for the end of either string, so that the values
	 * compared after the loop always belong to the position where
	 * the comparison stopped.  A terminating null decodes to the
	 * wide character zero. */
	for (; n > 0; n--) {
	    int s1_mb_len = parse_mbchar(s1, NULL, NULL);
	    int s2_mb_len = parse_mbchar(s2, NULL, NULL);

	    bad_s1_mb = (mbtowc(&ws1, s1, s1_mb_len) < 0);

	    if (bad_s1_mb) {
		mbtowc_reset();
		ws1 = (unsigned char)*s1;
	    }

	    bad_s2_mb = (mbtowc(&ws2, s2, s2_mb_len) < 0);

	    if (bad_s2_mb) {
		mbtowc_reset();
		ws2 = (unsigned char)*s2;
	    }

	    if (*s1 == '\0' || *s2 == '\0' || bad_s1_mb != bad_s2_mb ||
		towlower(ws1) != towlower(ws2))
		break;

	    s1 += s1_mb_len;
	    s2 += s2_mb_len;
	}

	/* All n characters matched. */
	if (n == 0)
	    return 0;

	diff = (int)towlower(ws1) - (int)towlower(ws2);

	/* An invalid byte and a valid character with the same value
	 * are still a mismatch. */
	if (diff == 0 && bad_s1_mb != bad_s2_mb)
	    diff = bad_s1_mb ? 1 : -1;

	return diff;
    } else
#endif
	return strncasecmp(s1, s2, n);
}

#ifndef HAVE_STRCASESTR
585
/* This function is equivalent to strcasestr(). */
586
char *nstrcasestr(const char *haystack, const char *needle)
587
{
588
589
    size_t haystack_len, needle_len;

590
591
    assert(haystack != NULL && needle != NULL);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
592
    if (*needle == '\0')
593
	return (char *)haystack;
594

595
596
    haystack_len = strlen(haystack);
    needle_len = strlen(needle);
597

598
599
600
    for (; *haystack != '\0' && haystack_len >= needle_len; haystack++,
	haystack_len--) {
	if (strncasecmp(haystack, needle, needle_len) == 0)
601
	    return (char *)haystack;
602
603
604
605
606
607
    }

    return NULL;
}
#endif

608
/* This function is equivalent to strcasestr() for multibyte strings.
 * In UTF-8 mode the search advances one multibyte character at a time
 * and lengths are counted in characters. */
char *mbstrcasestr(const char *haystack, const char *needle)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	size_t haystack_len, needle_len;

	assert(haystack != NULL && needle != NULL);

	if (*needle == '\0')
	    return (char *)haystack;

	haystack_len = mbstrlen(haystack);
	needle_len = mbstrlen(needle);

	/* Stop as soon as what is left of haystack has fewer
	 * characters than needle. */
	for (; *haystack != '\0' && haystack_len >= needle_len;
		haystack += move_mbright(haystack, 0), haystack_len--) {
	    if (mbstrncasecmp(haystack, needle, needle_len) == 0)
		return (char *)haystack;
	}

	return NULL;
    } else
#endif
	return (char *) strcasestr(haystack, needle);
}

635
#if !defined(NANO_TINY) || !defined(DISABLE_TABCOMP)
636
/* This function is equivalent to strstr(), except in that it scans the
637
 * string in reverse, starting at rev_start. */
638
639
char *revstrstr(const char *haystack, const char *needle, const char
	*rev_start)
640
{
641
642
    size_t rev_start_len, needle_len;

643
644
    assert(haystack != NULL && needle != NULL && rev_start != NULL);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
645
    if (*needle == '\0')
646
	return (char *)rev_start;
647

648
    needle_len = strlen(needle);
649

650
651
    if (strlen(haystack) < needle_len)
	return NULL;
652

653
654
655
656
657
    rev_start_len = strlen(rev_start);

    for (; rev_start >= haystack; rev_start--, rev_start_len++) {
	if (rev_start_len >= needle_len && strncmp(rev_start, needle,
		needle_len) == 0)
658
	    return (char *)rev_start;
659
660
661
662
    }

    return NULL;
}
663
#endif /* !NANO_TINY || !DISABLE_TABCOMP */
664

665
#ifndef NANO_TINY
666
/* This function is equivalent to strcasestr(), except in that it scans
667
 * the string in reverse, starting at rev_start. */
668
669
char *revstrcasestr(const char *haystack, const char *needle, const char
	*rev_start)
670
{
671
672
    size_t rev_start_len, needle_len;

673
674
    assert(haystack != NULL && needle != NULL && rev_start != NULL);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
675
    if (*needle == '\0')
676
	return (char *)rev_start;
677

678
679
680
681
    needle_len = strlen(needle);

    if (strlen(haystack) < needle_len)
	return NULL;
682

683
    rev_start_len = strlen(rev_start);
684

685
686
687
    for (; rev_start >= haystack; rev_start--, rev_start_len++) {
	if (rev_start_len >= needle_len && strncasecmp(rev_start,
		needle, needle_len) == 0)
688
	    return (char *)rev_start;
689
690
691
692
    }

    return NULL;
}
693
694
695
696

/* This function is equivalent to strcasestr() for multibyte strings,
 * except in that it scans the string in reverse, starting at
 * rev_start.  In UTF-8 mode the scan steps back one multibyte
 * character at a time and lengths are counted in characters. */
char *mbrevstrcasestr(const char *haystack, const char *needle, const
	char *rev_start)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	bool begin_line = FALSE;
	size_t rev_start_len, needle_len;

	assert(haystack != NULL && needle != NULL && rev_start != NULL);

	if (*needle == '\0')
	    return (char *)rev_start;

	needle_len = mbstrlen(needle);

	if (mbstrlen(haystack) < needle_len)
	    return NULL;

	/* The number of characters from rev_start to the end of the
	 * string. */
	rev_start_len = mbstrlen(rev_start);

	while (!begin_line) {
	    if (rev_start_len >= needle_len && mbstrncasecmp(rev_start,
		needle, needle_len) == 0)
		return (char *)rev_start;

	    if (rev_start == haystack)
		begin_line = TRUE;
	    else {
		rev_start = haystack + move_mbleft(haystack, rev_start -
			haystack);
		rev_start_len++;
	    }
	}

	return NULL;
    } else
#endif
	return revstrcasestr(haystack, needle, rev_start);
}
736
#endif /* !NANO_TINY */
737

738
739
740
741
742
743
/* This function is equivalent to strlen() for multibyte strings.  It
 * is mbstrnlen() with the largest possible length limit. */
size_t mbstrlen(const char *s)
{
    const size_t no_limit = (size_t)-1;

    return mbstrnlen(s, no_limit);
}

744
745
746
747
748
749
750
751
#ifndef HAVE_STRNLEN
/* This function is equivalent to strnlen(). */
size_t nstrnlen(const char *s, size_t maxlen)
{
    size_t n = 0;

    assert(s != NULL);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
752
    for (; *s != '\0' && maxlen > 0; s++, maxlen--, n++)
753
754
755
756
757
758
759
760
761
762
763
	;

    return n;
}
#endif

/* This function is equivalent to strnlen() for multibyte strings.  In
 * UTF-8 mode the result and maxlen are counted in multibyte characters
 * rather than bytes. */
size_t mbstrnlen(const char *s, size_t maxlen)
{
    assert(s != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	size_t n = 0;

	for (; *s != '\0' && maxlen > 0; s += move_mbright(s, 0),
		maxlen--, n++)
	    ;

	return n;
    } else
#endif
	return strnlen(s, maxlen);
}
777

778
#if !defined(NANO_TINY) || !defined(DISABLE_JUSTIFY)
779
/* This function is equivalent to strchr() for multibyte strings: return
 * a pointer to the first occurrence in s of the multibyte character at
 * c, or NULL if there is none.  In UTF-8 mode an invalid byte only
 * matches an invalid byte with the same value, never a valid
 * character. */
char *mbstrchr(const char *s, const char *c)
{
    assert(s != NULL && c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	bool bad_c_mb = FALSE;
	wchar_t ws, wc;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wc = (unsigned char)*c;
	    bad_c_mb = TRUE;
	}

	while (*s != '\0') {
	    /* The validity flag is per character, so it must start
	     * out clear on every iteration. */
	    bool bad_s_mb = FALSE;
	    int s_mb_len = parse_mbchar(s, NULL, NULL);

	    if (mbtowc(&ws, s, s_mb_len) < 0) {
		mbtowc_reset();
		ws = (unsigned char)*s;
		bad_s_mb = TRUE;
	    }

	    if (bad_s_mb == bad_c_mb && ws == wc)
		break;

	    s += s_mb_len;
	}

	/* Reaching the terminator means nothing matched. */
	if (*s == '\0')
	    return NULL;

	return (char *)s;
    } else
#endif
	return (char *) strchr(s, *c);
}
823
#endif /* !NANO_TINY || !DISABLE_JUSTIFY */
824

825
826
827
828
829
830
831
#ifndef NANO_TINY
/* This function is equivalent to strpbrk() for multibyte strings:
 * return a pointer to the first multibyte character in s that also
 * occurs in accept, or NULL if there is none. */
char *mbstrpbrk(const char *s, const char *accept)
{
    assert(s != NULL && accept != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	for (; *s != '\0'; s += move_mbright(s, 0)) {
	    if (mbstrchr(accept, s) != NULL)
		return (char *)s;
	}

	return NULL;
    } else
#endif
	return (char *) strpbrk(s, accept);
}

/* This function is equivalent to strpbrk(), except in that it scans the
 * string in reverse, starting at rev_start. */
char *revstrpbrk(const char *s, const char *accept, const char
	*rev_start)
{
    assert(s != NULL && accept != NULL && rev_start != NULL);

    for (; rev_start >= s; rev_start--) {
	const char *q = (*rev_start == '\0') ? NULL : strchr(accept,
		*rev_start);

	if (q != NULL)
	    return (char *)rev_start;
    }

    return NULL;
}

/* This function is equivalent to strpbrk() for multibyte strings,
 * except in that it scans the string in reverse, starting at
 * rev_start.  In UTF-8 mode the scan steps back one multibyte
 * character at a time. */
char *mbrevstrpbrk(const char *s, const char *accept, const char
	*rev_start)
{
    assert(s != NULL && accept != NULL && rev_start != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	bool begin_line = FALSE;

	while (!begin_line) {
	    /* The terminating null is never treated as a match. */
	    const char *q = (*rev_start == '\0') ? NULL :
		mbstrchr(accept, rev_start);

	    if (q != NULL)
		return (char *)rev_start;

	    if (rev_start == s)
		begin_line = TRUE;
	    else
		rev_start = s + move_mbleft(s, rev_start - s);
	}

	return NULL;
    } else
#endif
	return revstrpbrk(s, accept, rev_start);
}
#endif /* !NANO_TINY */

894
#if defined(ENABLE_NANORC) && (!defined(NANO_TINY) || !defined(DISABLE_JUSTIFY))
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
/* Return TRUE if the string s contains one or more blank characters,
 * and FALSE otherwise. */
bool has_blank_chars(const char *s)
{
    assert(s != NULL);

    /* isblank() is only defined for unsigned char values and EOF, so
     * each byte is converted before it is passed in. */
    for (; *s != '\0'; s++) {
	if (isblank((unsigned char)*s))
	    break;
    }

    /* Stopping before the terminator means a blank was found. */
    return (*s != '\0');
}

/* Return TRUE if the multibyte string s contains one or more blank
 * multibyte characters, and FALSE otherwise. */
bool has_blank_mbchars(const char *s)
{
    assert(s != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	/* Classify each character in place.  s is null-terminated, so
	 * a truncated sequence at its end is seen as invalid instead
	 * of being decoded together with whatever bytes happen to
	 * follow it in a scratch buffer. */
	for (; *s != '\0'; s += move_mbright(s, 0)) {
	    if (is_blank_mbchar(s))
		return TRUE;
	}

	return FALSE;
    } else
#endif
	return has_blank_chars(s);
}
936
#endif /* ENABLE_NANORC && (!NANO_TINY || !DISABLE_JUSTIFY) */
937

938
#ifdef ENABLE_UTF8
939
/* Return TRUE if wc is valid Unicode, and FALSE otherwise.  A value is
 * rejected when it lies outside 0 to 0x10FFFF, in the surrogate range
 * 0xD800 to 0xDFFF, in the range 0xFDD0 to 0xFDEF, or when its low
 * sixteen bits are 0xFFFE or 0xFFFF. */
bool is_valid_unicode(wchar_t wc)
{
    const bool in_range = (0 <= wc && wc <= 0x10FFFF);
    const bool is_surrogate = (0xD800 <= wc && wc <= 0xDFFF);
    const bool in_fdd0_block = (0xFDD0 <= wc && wc <= 0xFDEF);
    const bool ends_plane = ((wc & 0xFFFF) >= 0xFFFE);

    return in_range && !is_surrogate && !in_fdd0_block && !ends_plane;
}
#endif

948
949
950
951
952
953
954
#ifdef ENABLE_NANORC
/* Check if the string s is a valid multibyte string.  Return TRUE if it
 * is, and FALSE otherwise. */
bool is_valid_mbstring(const char *s)
{
    assert(s != NULL);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
955
    return
956
#ifdef ENABLE_UTF8
957
	use_utf8 ? (mbstowcs(NULL, s, 0) != (size_t)-1) :
958
959
960
961
#endif
	TRUE;
}
#endif /* ENABLE_NANORC */