chars.c 22.4 KB
Newer Older
1
2
3
/**************************************************************************
 *   chars.c                                                              *
 *                                                                        *
 *   Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,  *
 *   2010, 2011, 2013, 2014 Free Software Foundation, Inc.                *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 3, or (at your option)  *
 *   any later version.                                                   *
 *                                                                        *
 *   This program is distributed in the hope that it will be useful, but  *
 *   WITHOUT ANY WARRANTY; without even the implied warranty of           *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    *
 *   General Public License for more details.                             *
 *                                                                        *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the Free Software          *
 *   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA            *
 *   02110-1301, USA.                                                     *
 *                                                                        *
 **************************************************************************/

23
#include "proto.h"
24

25
#include <string.h>
26
27
#include <ctype.h>

28
#ifdef ENABLE_UTF8
29
#ifdef HAVE_WCHAR_H
30
31
#include <wchar.h>
#endif
32
#ifdef HAVE_WCTYPE_H
33
34
#include <wctype.h>
#endif
35

36
37
/* Whether we've enabled UTF-8 support. */
static bool use_utf8 = FALSE;

/* The wide character that stands in for an invalid multibyte sequence:
 * Unicode FFFD (Replacement Character).  It is not substituted when
 * we're searching for a match to the invalid sequence. */
static const wchar_t bad_wchar = 0xFFFD;

/* The UTF-8 encoding of Unicode FFFD, and its length in bytes. */
static const char *const bad_mbchar = "\xEF\xBF\xBD";
static const int bad_mbchar_len = 3;
44
45
46
47
48
49
50
51
52
53
54
55

/* Switch on UTF-8 handling for the multibyte routines in this file. */
void utf8_init(void)
{
    use_utf8 = TRUE;
}

/* Report whether UTF-8 handling has been switched on. */
bool using_utf8(void)
{
    return use_utf8;
}
Benno Schulenberg's avatar
Benno Schulenberg committed
56
#endif /* ENABLE_UTF8 */
57

58
/* Append at most len2 bytes of str2 to str1, which is taken to hold
 * len1 bytes.  str1 is reallocated to fit and returned (it may have
 * moved); str2 is freed. */
char *addstrings(char* str1, size_t len1, char* str2, size_t len2)
{
    char *joined = charealloc(str1, len1 + len2 + 1);

    /* Terminate at len1 so that strncat() appends exactly there. */
    joined[len1] = '\0';
    strncat(joined + len1, str2, len2);

    free(str2);

    return joined;
}

70
71
#ifndef HAVE_ISBLANK
/* This function is equivalent to isblank(). */
72
bool nisblank(int c)
73
{
74
    return isspace(c) && (c == '\t' || !is_cntrl_char(c));
75
}
76
#endif
77

78
#if !defined(HAVE_ISWBLANK) && defined(ENABLE_UTF8)
79
/* This function is equivalent to iswblank(). */
80
bool niswblank(wchar_t wc)
81
{
82
    return iswspace(wc) && (wc == '\t' || !is_cntrl_wchar(wc));
83
}
84
#endif
85

86
/* Return TRUE if the value of c is in byte range, and FALSE otherwise. */
87
88
89
90
91
/* Tell whether c survives truncation to an unsigned char unchanged,
 * that is, whether it lies in the range of a single byte. */
bool is_byte(int c)
{
    const unsigned int value = (unsigned int)c;

    return (value == (unsigned char)c);
}

92
93
94
95
96
97
98
99
100
101
/* Return mbtowc() to its initial conversion state by calling it with a
 * null string.  The call is wrapped in IGNORE_CALL_RESULT, which
 * presumably just discards the return value. */
void mbtowc_reset(void)
{
    IGNORE_CALL_RESULT(mbtowc(NULL, NULL, 0));
}

/* Return wctomb() to its initial conversion state by calling it with a
 * null buffer.  The call is wrapped in IGNORE_CALL_RESULT, which
 * presumably just discards the return value. */
void wctomb_reset(void)
{
    IGNORE_CALL_RESULT(wctomb(NULL, 0));
}

102
103
104
105
106
/* This function is equivalent to isalnum() for multibyte characters.
 * In UTF-8 mode, an invalid sequence is classified as if it were the
 * replacement character; otherwise only the first byte is looked at. */
bool is_alnum_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;
	int parsed = mbtowc(&wide, c, MB_CUR_MAX);

	if (parsed < 0) {
	    /* Invalid sequence: reset the converter and substitute. */
	    mbtowc_reset();
	    wide = bad_wchar;
	}

	return iswalnum(wide);
    }
#endif

    return isalnum((unsigned char)*c);
}

122
123
124
125
126
/* This function is equivalent to isblank() for multibyte characters.
 * In UTF-8 mode, an invalid sequence is classified as if it were the
 * replacement character; otherwise only the first byte is looked at. */
bool is_blank_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;
	int parsed = mbtowc(&wide, c, MB_CUR_MAX);

	if (parsed < 0) {
	    /* Invalid sequence: reset the converter and substitute. */
	    mbtowc_reset();
	    wide = bad_wchar;
	}

	return iswblank(wide);
    }
#endif

    return isblank((unsigned char)*c);
}

142
143
144
145
146
147
148
/* This function is equivalent to iscntrl(), except that it recognizes
 * only the control characters without the high bit set: the values
 * zero through 31. */
bool is_ascii_cntrl_char(int c)
{
    return (c >= 0 && c <= 31);
}

149
/* This function is equivalent to iscntrl(), except in that it also
 * handles high-bit control characters.  Those are accepted both as
 * values 128 through 159 and as the negative values -128 through -97
 * that they take when held in a signed char. */
bool is_cntrl_char(int c)
{
    const bool high_as_signed = (c >= -128 && c <= -97);
    const bool low_controls = (c >= 0 && c <= 31);
    const bool delete_and_high = (c >= 127 && c <= 159);

    return high_as_signed || low_controls || delete_and_high;
}

157
#ifdef ENABLE_UTF8
158
159
160
/* This function is equivalent to iscntrl() for wide characters, except
 * in that it also handles wide control characters with their high bits
 * set: the accepted values are 0 through 31 and 127 through 159. */
bool is_cntrl_wchar(wchar_t wc)
{
    const bool low_controls = (wc >= 0 && wc <= 31);
    const bool delete_and_high = (wc >= 127 && wc <= 159);

    return low_controls || delete_and_high;
}
#endif

167
168
169
170
171
172
173
/* This function is equivalent to iscntrl() for multibyte characters,
 * except in that it also handles multibyte control characters with
 * their high bits set.  In UTF-8 mode, an invalid sequence is
 * classified as if it were the replacement character. */
bool is_cntrl_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;
	int parsed = mbtowc(&wide, c, MB_CUR_MAX);

	if (parsed < 0) {
	    /* Invalid sequence: reset the converter and substitute. */
	    mbtowc_reset();
	    wide = bad_wchar;
	}

	return is_cntrl_wchar(wide);
    }
#endif

    return is_cntrl_char((unsigned char)*c);
}

189
190
/* This function is equivalent to ispunct() for multibyte characters.
 * In UTF-8 mode, an invalid sequence is classified as if it were the
 * replacement character; otherwise only the first byte is looked at. */
bool is_punct_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;
	int parsed = mbtowc(&wide, c, MB_CUR_MAX);

	if (parsed < 0) {
	    /* Invalid sequence: reset the converter and substitute. */
	    mbtowc_reset();
	    wide = bad_wchar;
	}

	return iswpunct(wide);
    }
#endif

    return ispunct((unsigned char)*c);
}

/* Return TRUE for a multibyte character found in a word (currently only
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
210
 * an alphanumeric or punctuation character, and only the latter if
211
212
213
214
215
216
217
 * allow_punct is TRUE) and FALSE otherwise. */
bool is_word_mbchar(const char *c, bool allow_punct)
{
    assert(c != NULL);

    return is_alnum_mbchar(c) || (allow_punct ? is_punct_mbchar(c) :
	FALSE);
218
219
}

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
220
/* c is a control character.  It displays as ^@, ^?, or ^[ch], where ch
221
 * is (c + 64).  We return that character. */
222
char control_rep(char c)
223
{
224
225
    assert(is_cntrl_char(c));

226
227
228
229
230
231
232
233
234
    /* Treat newlines embedded in a line as encoded nulls. */
    if (c == '\n')
	return '@';
    else if (c == NANO_CONTROL_8)
	return '?';
    else
	return c + 64;
}

235
#ifdef ENABLE_UTF8
236
/* c is a wide control character.  It displays as ^@, ^?, or ^[ch],
237
 * where ch is (c + 64).  We return that wide character. */
238
239
wchar_t control_wrep(wchar_t wc)
{
240
241
    assert(is_cntrl_wchar(wc));

242
243
244
245
246
247
248
249
250
251
252
    /* Treat newlines embedded in a line as encoded nulls. */
    if (wc == '\n')
	return '@';
    else if (wc == NANO_CONTROL_8)
	return '?';
    else
	return wc + 64;
}
#endif

/* c is a multibyte control character.  Store its visible equivalent
 * (see control_rep() and control_wrep()) in crep and the length of that
 * equivalent in crep_len, and return crep.  In UTF-8 mode, when c is an
 * invalid multibyte sequence, crep gets the encoding of Unicode 0xFFFD
 * (Replacement Character) instead; when the equivalent cannot be
 * converted back to multibyte form, crep_len is set to zero.  The
 * result is not null-terminated. */
char *control_mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;

	if (mbtowc(&wide, c, MB_CUR_MAX) >= 0) {
	    int written = wctomb(crep, control_wrep(wide));

	    if (written < 0) {
		wctomb_reset();
		written = 0;
	    }

	    *crep_len = written;
	} else {
	    mbtowc_reset();
	    *crep_len = bad_mbchar_len;
	    strncpy(crep, bad_mbchar, *crep_len);
	}

	return crep;
    }
#endif

    *crep_len = 1;
    *crep = control_rep(*c);

    return crep;
}

286
/* c is a multibyte non-control character.  Store it in crep and its
 * length in crep_len, and return crep.  In UTF-8 mode, when c is an
 * invalid multibyte sequence or decodes to an invalid Unicode value,
 * crep gets the encoding of Unicode 0xFFFD (Replacement Character)
 * instead; when the character cannot be converted back to multibyte
 * form, crep_len is set to zero.  The result is not null-terminated. */
char *mbrep(const char *c, char *crep, int *crep_len)
{
    assert(c != NULL && crep != NULL && crep_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;

	if (mbtowc(&wide, c, MB_CUR_MAX) >= 0 && is_valid_unicode(wide)) {
	    int written = wctomb(crep, wide);

	    if (written < 0) {
		wctomb_reset();
		written = 0;
	    }

	    *crep_len = written;
	} else {
	    /* Reject invalid Unicode characters. */
	    mbtowc_reset();
	    *crep_len = bad_mbchar_len;
	    strncpy(crep, bad_mbchar, *crep_len);
	}

	return crep;
    }
#endif

    *crep_len = 1;
    *crep = *c;

    return crep;
}

320
321
322
323
324
/* This function is equivalent to wcwidth() for multibyte characters:
 * return the number of columns that the character at c occupies.  In
 * UTF-8 mode, an invalid sequence, or a character for which wcwidth()
 * reports -1, gets the width of the replacement character.  Without
 * UTF-8, every character is one column wide. */
int mbwidth(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;
	int columns;

	if (mbtowc(&wide, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wide = bad_wchar;
	}

	columns = wcwidth(wide);

	/* No usable width: measure the replacement character instead. */
	if (columns == -1)
	    columns = wcwidth(bad_wchar);

	return columns;
    }
#endif

    return 1;
}

/* Return the maximum width in bytes of a multibyte character: the
 * value of MB_CUR_MAX in UTF-8 mode, and one otherwise. */
int mb_cur_max(void)
{
#ifdef ENABLE_UTF8
    if (use_utf8)
	return MB_CUR_MAX;
#endif

    return 1;
}

358
359
/* Convert the Unicode value in chr to a multibyte character with the
 * same wide character value as chr, if possible.  If the conversion
 * succeeds, return the (dynamically allocated) multibyte character and
 * store its length in chr_mb_len.  Otherwise, return an undefined
 * (dynamically allocated) multibyte character and a length of zero.
 * Without UTF-8, the result is the single byte that chr narrows to.
 * The caller owns the returned buffer, which is not null-terminated. */
char *make_mbchar(long chr, int *chr_mb_len)
{
    char *chr_mb;

    assert(chr_mb_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	chr_mb = charalloc(MB_CUR_MAX);
	*chr_mb_len = wctomb(chr_mb, (wchar_t)chr);

	/* Reject invalid Unicode characters. */
	if (*chr_mb_len < 0 || !is_valid_unicode((wchar_t)chr)) {
	    wctomb_reset();
	    *chr_mb_len = 0;
	}
    } else
#endif
    {
	/* Narrow by value.  Reading the first byte of the long's
	 * storage would yield the most significant byte on big-endian
	 * machines. */
	char byte = (char)chr;

	*chr_mb_len = 1;
	chr_mb = mallocstrncpy(NULL, &byte, 1);
    }

    return chr_mb;
}

/* Parse a multibyte character from buf.  Return the number of bytes
 * used.  If chr isn't NULL, store the multibyte character in it.  If
391
392
393
 * col isn't NULL, store the new display width in it.  If *buf is '\t',
 * we expect col to have the current display width. */
int parse_mbchar(const char *buf, char *chr, size_t *col)
394
395
396
397
398
{
    int buf_mb_len;

    assert(buf != NULL);

399
#ifdef ENABLE_UTF8
400
    if (use_utf8) {
401
402
403
	/* Get the number of bytes in the multibyte character. */
	buf_mb_len = mblen(buf, MB_CUR_MAX);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
404
405
	/* If buf contains an invalid multibyte character, only
	 * interpret buf's first byte. */
406
	if (buf_mb_len < 0) {
407
	    IGNORE_CALL_RESULT(mblen(NULL, 0));
408
	    buf_mb_len = 1;
409
410
	} else if (buf_mb_len == 0)
	    buf_mb_len++;
411
412
413
414

	/* Save the multibyte character in chr. */
	if (chr != NULL) {
	    int i;
415

416
417
418
419
420
421
422
423
424
425
426
427
428
	    for (i = 0; i < buf_mb_len; i++)
		chr[i] = buf[i];
	}

	/* Save the column width of the wide character in col. */
	if (col != NULL) {
	    /* If we have a tab, get its width in columns using the
	     * current value of col. */
	    if (*buf == '\t')
		*col += tabsize - *col % tabsize;
	    /* If we have a control character, get its width using one
	     * column for the "^" that will be displayed in front of it,
	     * and the width in columns of its visible equivalent as
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
429
	     * returned by control_mbrep(). */
430
	    else if (is_cntrl_mbchar(buf)) {
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
431
		char *ctrl_buf_mb = charalloc(MB_CUR_MAX);
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
		int ctrl_buf_mb_len;

		(*col)++;

		ctrl_buf_mb = control_mbrep(buf, ctrl_buf_mb,
			&ctrl_buf_mb_len);

		*col += mbwidth(ctrl_buf_mb);

		free(ctrl_buf_mb);
	    /* If we have a normal character, get its width in columns
	     * normally. */
	    } else
		*col += mbwidth(buf);
	}
447
    } else
448
#endif
449
    {
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
	/* Get the number of bytes in the byte character. */
	buf_mb_len = 1;

	/* Save the byte character in chr. */
	if (chr != NULL)
	    *chr = *buf;

	if (col != NULL) {
	    /* If we have a tab, get its width in columns using the
	     * current value of col. */
	    if (*buf == '\t')
		*col += tabsize - *col % tabsize;
	    /* If we have a control character, it's two columns wide:
	     * one column for the "^" that will be displayed in front of
	     * it, and one column for its visible equivalent as returned
David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
465
	     * by control_mbrep(). */
466
467
468
469
470
471
472
473
474
475
	    else if (is_cntrl_char((unsigned char)*buf))
		*col += 2;
	    /* If we have a normal character, it's one column wide. */
	    else
		(*col)++;
	}
    }

    return buf_mb_len;
}
476
477
478
479
480

/* Return the index in buf of the beginning of the multibyte character
 * before the one at pos.  When pos is zero, zero is returned. */
size_t move_mbleft(const char *buf, size_t pos)
{
    const size_t reach = mb_cur_max();
    size_t scan, last_len = 0;

    assert(buf != NULL && pos <= strlen(buf));

    /* There is no library function to step back one multibyte
     * character, so start as far back as one character can reach (but
     * not before the start of buf) and parse forward from there. */
    scan = (reach > pos) ? 0 : pos - reach;

    /* Walk forward until pos is reached or passed, remembering the
     * length of the character parsed last. */
    while (scan < pos) {
	last_len = parse_mbchar(buf + scan, NULL, NULL);
	scan += last_len;
    }

    return scan - last_len;
}

/* Return the index in buf of the beginning of the multibyte character
 * after the one at pos: pos plus the length in bytes of the character
 * that starts there. */
size_t move_mbright(const char *buf, size_t pos)
{
    const int char_len = parse_mbchar(buf + pos, NULL, NULL);

    return pos + char_len;
}
507
508
509
510
511

#ifndef HAVE_STRCASECMP
/* This function is equivalent to strcasecmp().  It is strncasecmp()
 * with the largest possible length limit. */
int nstrcasecmp(const char *s1, const char *s2)
{
    const size_t no_limit = (size_t)-1;

    return strncasecmp(s1, s2, no_limit);
}
#endif

/* This function is equivalent to strcasecmp() for multibyte strings.
 * It is mbstrncasecmp() with the largest possible length limit. */
int mbstrcasecmp(const char *s1, const char *s2)
{
    const size_t no_limit = (size_t)-1;

    return mbstrncasecmp(s1, s2, no_limit);
}

#ifndef HAVE_STRNCASECMP
/* This function is equivalent to strncasecmp(): compare at most n
 * bytes of s1 and s2 while ignoring case.  Return zero when they are
 * equal, and otherwise the difference between the lowercased values of
 * the first pair of bytes that differ. */
int nstrncasecmp(const char *s1, const char *s2, size_t n)
{
    if (s1 == s2)
	return 0;

    assert(s1 != NULL && s2 != NULL);

    for (; n > 0; s1++, s2++, n--) {
	/* The argument of tolower() must be representable as an
	 * unsigned char; passing a negative plain char is undefined. */
	const int low1 = tolower((unsigned char)*s1);
	const int low2 = tolower((unsigned char)*s2);

	/* Stop at the first difference, or at the end of both strings
	 * (where the difference is zero). */
	if (low1 != low2 || *s1 == '\0')
	    return low1 - low2;
    }

    return 0;
}
#endif

540
/* This function is equivalent to strncasecmp() for multibyte strings:
 * compare at most n multibyte characters of s1 and s2 while ignoring
 * case.  Return zero when they are equal and nonzero otherwise.  An
 * invalid byte sequence is compared by the value of its first byte, and
 * never matches a valid character.  Without UTF-8, this is plain
 * strncasecmp(). */
int mbstrncasecmp(const char *s1, const char *s2, size_t n)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	if (s1 == s2)
	    return 0;

	assert(s1 != NULL && s2 != NULL);

	for (; *s1 != '\0' && *s2 != '\0' && n > 0;
		s1 += move_mbright(s1, 0), s2 += move_mbright(s2, 0), n--) {
	    bool bad1 = FALSE, bad2 = FALSE;
	    wchar_t wc1, wc2;

	    if (mbtowc(&wc1, s1, MB_CUR_MAX) < 0) {
		mbtowc_reset();
		wc1 = (unsigned char)*s1;
		bad1 = TRUE;
	    }

	    if (mbtowc(&wc2, s2, MB_CUR_MAX) < 0) {
		mbtowc_reset();
		wc2 = (unsigned char)*s2;
		bad2 = TRUE;
	    }

	    if (bad1 != bad2 || towlower(wc1) != towlower(wc2)) {
		int difference = towlower(wc1) - towlower(wc2);

		/* An invalid byte may have the same numeric value as
		 * a valid character; that is still a mismatch. */
		if (difference == 0)
		    difference = bad1 ? 1 : -1;

		return difference;
	    }
	}

	/* All n characters matched. */
	if (n == 0)
	    return 0;

	/* At least one string ended before n characters were compared.
	 * The strings are equal only if both ended, so compare the
	 * bytes where we stopped instead of the last decoded pair. */
	return (unsigned char)*s1 - (unsigned char)*s2;
    } else
#endif
	return strncasecmp(s1, s2, n);
}

#ifndef HAVE_STRCASESTR
579
/* This function is equivalent to strcasestr(): return a pointer to the
 * first occurrence of needle in haystack, ignoring case, or NULL when
 * there is none.  An empty needle matches at the start of haystack. */
char *nstrcasestr(const char *haystack, const char *needle)
{
    size_t remaining, wanted;

    assert(haystack != NULL && needle != NULL);

    if (*needle == '\0')
	return (char *)haystack;

    remaining = strlen(haystack);
    wanted = strlen(needle);

    /* Try each position that leaves enough room for the needle. */
    while (*haystack != '\0' && remaining >= wanted) {
	if (strncasecmp(haystack, needle, wanted) == 0)
	    return (char *)haystack;

	haystack++;
	remaining--;
    }

    return NULL;
}
#endif

602
/* This function is equivalent to strcasestr() for multibyte strings:
 * return a pointer to the first occurrence of needle in haystack,
 * ignoring case, or NULL when there is none.  In UTF-8 mode, a match
 * is accepted only where mblen() reports that a valid character
 * starts. */
char *mbstrcasestr(const char *haystack, const char *needle)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	size_t remaining, wanted;

	assert(haystack != NULL && needle != NULL);

	if (*needle == '\0')
	    return (char *)haystack;

	/* Both lengths are counted in multibyte characters. */
	remaining = mbstrlen(haystack);
	wanted = mbstrlen(needle);

	while (*haystack != '\0' && remaining >= wanted) {
	    if (mbstrncasecmp(haystack, needle, wanted) == 0 &&
			mblen(haystack, MB_CUR_MAX) > 0)
		return (char *)haystack;

	    haystack += move_mbright(haystack, 0);
	    remaining--;
	}

	return NULL;
    }
#endif

    return (char *) strcasestr(haystack, needle);
}

630
#if !defined(NANO_TINY) || !defined(DISABLE_TABCOMP)
631
/* This function is equivalent to strstr(), except in that it scans the
 * string in reverse, starting at rev_start.  Return a pointer to the
 * last occurrence of needle that begins at or before rev_start and at
 * or after haystack, or NULL when there is none.  An empty needle
 * matches at rev_start itself. */
char *revstrstr(const char *haystack, const char *needle, const char
	*rev_start)
{
    size_t rev_start_len, needle_len;

    assert(haystack != NULL && needle != NULL && rev_start != NULL);

    if (*needle == '\0')
	return (char *)rev_start;

    needle_len = strlen(needle);

    if (strlen(haystack) < needle_len)
	return NULL;

    rev_start_len = strlen(rev_start);

    if (rev_start < haystack)
	return NULL;

    for (;;) {
	if (rev_start_len >= needle_len && strncmp(rev_start, needle,
		needle_len) == 0)
	    return (char *)rev_start;

	/* Stop at the first byte: stepping the pointer to before the
	 * start of the string would be undefined behavior. */
	if (rev_start == haystack)
	    break;

	rev_start--;
	rev_start_len++;
    }

    return NULL;
}
658
#endif /* !NANO_TINY || !DISABLE_TABCOMP */
659

660
#ifndef NANO_TINY
661
/* This function is equivalent to strcasestr(), except in that it scans
 * the string in reverse, starting at rev_start.  Return a pointer to
 * the last case-insensitive occurrence of needle that begins at or
 * before rev_start and at or after haystack, or NULL when there is
 * none.  An empty needle matches at rev_start itself. */
char *revstrcasestr(const char *haystack, const char *needle, const char
	*rev_start)
{
    size_t rev_start_len, needle_len;

    assert(haystack != NULL && needle != NULL && rev_start != NULL);

    if (*needle == '\0')
	return (char *)rev_start;

    needle_len = strlen(needle);

    if (strlen(haystack) < needle_len)
	return NULL;

    rev_start_len = strlen(rev_start);

    if (rev_start < haystack)
	return NULL;

    for (;;) {
	if (rev_start_len >= needle_len && strncasecmp(rev_start,
		needle, needle_len) == 0)
	    return (char *)rev_start;

	/* Stop at the first byte: stepping the pointer to before the
	 * start of the string would be undefined behavior. */
	if (rev_start == haystack)
	    break;

	rev_start--;
	rev_start_len++;
    }

    return NULL;
}
688
689

/* This function is equivalent to strcasestr() for multibyte strings,
 * except in that it scans the string in reverse, starting at rev_start.
 * In UTF-8 mode, it steps back one multibyte character at a time and
 * accepts a match only where mblen() reports that a valid character
 * starts.  Without UTF-8, it defers to revstrcasestr(). */
char *mbrevstrcasestr(const char *haystack, const char *needle, const
	char *rev_start)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	size_t tail_len, wanted;

	assert(haystack != NULL && needle != NULL && rev_start != NULL);

	if (*needle == '\0')
	    return (char *)rev_start;

	/* All lengths are counted in multibyte characters. */
	wanted = mbstrlen(needle);

	if (mbstrlen(haystack) < wanted)
	    return NULL;

	tail_len = mbstrlen(rev_start);

	for (;;) {
	    if (tail_len >= wanted &&
			mbstrncasecmp(rev_start, needle, wanted) == 0 &&
			mblen(rev_start, MB_CUR_MAX) > 0)
		return (char *)rev_start;

	    /* Nothing is left to try before the start of the line. */
	    if (rev_start == haystack)
		return NULL;

	    rev_start = haystack + move_mbleft(haystack, rev_start -
			haystack);
	    tail_len++;
	}
    }
#endif

    return revstrcasestr(haystack, needle, rev_start);
}
731
#endif /* !NANO_TINY */
732

733
734
735
736
737
738
/* This function is equivalent to strlen() for multibyte strings.  It
 * is mbstrnlen() with the largest possible length limit. */
size_t mbstrlen(const char *s)
{
    const size_t no_limit = (size_t)-1;

    return mbstrnlen(s, no_limit);
}

739
740
741
742
743
744
745
746
#ifndef HAVE_STRNLEN
/* This function is equivalent to strnlen(): return the length of s in
 * bytes, but never more than maxlen. */
size_t nstrnlen(const char *s, size_t maxlen)
{
    size_t count = 0;

    assert(s != NULL);

    while (s[count] != '\0' && count < maxlen)
	count++;

    return count;
}
#endif

/* This function is equivalent to strnlen() for multibyte strings:
 * return the number of multibyte characters in s, but never more than
 * maxlen.  Without UTF-8, this is plain strnlen(). */
size_t mbstrnlen(const char *s, size_t maxlen)
{
    assert(s != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	size_t count = 0;

	/* Hop from character to character until the string or the
	 * allowance runs out. */
	while (*s != '\0' && maxlen > 0) {
	    s += move_mbright(s, 0);
	    maxlen--;
	    count++;
	}

	return count;
    }
#endif

    return strnlen(s, maxlen);
}
772

773
#if !defined(NANO_TINY) || !defined(DISABLE_JUSTIFY)
774
/* This function is equivalent to strchr() for multibyte strings:
 * return a pointer to the first occurrence in s of the multibyte
 * character that c points to, or NULL when there is none.  An invalid
 * byte sequence is compared by the value of its first byte, and matches
 * only another invalid sequence.  Without UTF-8, this is plain strchr()
 * on the first byte of c. */
char *mbstrchr(const char *s, const char *c)
{
    assert(s != NULL && c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	bool bad_c_mb = FALSE;
	wchar_t ws, wc;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wc = (unsigned char)*c;
	    bad_c_mb = TRUE;
	}

	while (*s != '\0') {
	    /* The validity flag is per character.  Carrying it over
	     * from an earlier invalid byte would keep every later
	     * valid character from matching. */
	    bool bad_s_mb = FALSE;
	    int s_mb_len = parse_mbchar(s, NULL, NULL);

	    if (mbtowc(&ws, s, s_mb_len) < 0) {
		mbtowc_reset();
		ws = (unsigned char)*s;
		bad_s_mb = TRUE;
	    }

	    if (bad_s_mb == bad_c_mb && ws == wc)
		return (char *)s;

	    s += s_mb_len;
	}

	return NULL;
    } else
#endif
	return (char *) strchr(s, *c);
}
818
#endif /* !NANO_TINY || !DISABLE_JUSTIFY */
819

820
821
822
823
824
825
826
#ifndef NANO_TINY
/* This function is equivalent to strpbrk() for multibyte strings:
 * return a pointer to the first multibyte character in s that also
 * occurs in accept, or NULL when there is none.  Without UTF-8, this
 * is plain strpbrk(). */
char *mbstrpbrk(const char *s, const char *accept)
{
    assert(s != NULL && accept != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	while (*s != '\0') {
	    if (mbstrchr(accept, s) != NULL)
		return (char *)s;

	    s += move_mbright(s, 0);
	}

	return NULL;
    }
#endif

    return (char *) strpbrk(s, accept);
}

/* This function is equivalent to strpbrk(), except in that it scans the
 * string in reverse, starting at rev_start.  Return a pointer to the
 * last byte at or before rev_start (and at or after s) that occurs in
 * accept, or NULL when there is none.  A terminating null byte at
 * rev_start is never reported as a match. */
char *revstrpbrk(const char *s, const char *accept, const char
	*rev_start)
{
    assert(s != NULL && accept != NULL && rev_start != NULL);

    if (rev_start < s)
	return NULL;

    for (;;) {
	/* strchr() would also find the terminator of accept, so a
	 * null byte has to be skipped explicitly. */
	if (*rev_start != '\0' && strchr(accept, *rev_start) != NULL)
	    return (char *)rev_start;

	/* Stop at the first byte: stepping the pointer to before the
	 * start of the string would be undefined behavior. */
	if (rev_start == s)
	    return NULL;

	rev_start--;
    }
}

/* This function is equivalent to strpbrk() for multibyte strings,
 * except in that it scans the string in reverse, starting at rev_start.
 * In UTF-8 mode, it steps back one multibyte character at a time; a
 * null byte at rev_start is never reported as a match.  Without UTF-8,
 * it defers to revstrpbrk(). */
char *mbrevstrpbrk(const char *s, const char *accept, const char
	*rev_start)
{
    assert(s != NULL && accept != NULL && rev_start != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	for (;;) {
	    if (*rev_start != '\0' && mbstrchr(accept, rev_start) != NULL)
		return (char *)rev_start;

	    /* Nothing is left to try before the start of the line. */
	    if (rev_start == s)
		return NULL;

	    rev_start = s + move_mbleft(s, rev_start - s);
	}
    }
#endif

    return revstrpbrk(s, accept, rev_start);
}
#endif /* !NANO_TINY */

888
#if !defined(DISABLE_NANORC) && (!defined(NANO_TINY) || !defined(DISABLE_JUSTIFY))
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
/* Return TRUE if the string s contains one or more blank characters,
 * and FALSE otherwise. */
bool has_blank_chars(const char *s)
{
    assert(s != NULL);

    /* The argument of isblank() must be representable as an unsigned
     * char; passing a negative plain char is undefined. */
    while (*s != '\0' && !isblank((unsigned char)*s))
	s++;

    /* We stopped before the terminator only if a blank was found. */
    return (*s != '\0');
}

/* Return TRUE if the multibyte string s contains one or more blank
 * multibyte characters, and FALSE otherwise.  Without UTF-8, this
 * defers to has_blank_chars(). */
bool has_blank_mbchars(const char *s)
{
    assert(s != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	/* Scratch space that holds one character at a time. */
	char *scratch = charalloc(MB_CUR_MAX);
	bool found = FALSE;

	while (*s != '\0' && !found) {
	    parse_mbchar(s, scratch, NULL);

	    if (is_blank_mbchar(scratch))
		found = TRUE;
	    else
		s += move_mbright(s, 0);
	}

	free(scratch);

	return found;
    }
#endif

    return has_blank_chars(s);
}
930
#endif /* !DISABLE_NANORC && (!NANO_TINY || !DISABLE_JUSTIFY) */
931

932
#ifdef ENABLE_UTF8
933
/* Return TRUE if wc is valid Unicode, and FALSE otherwise.  Rejected
 * are negative values, D800 through DFFF, FDD0 through FDEF, every
 * value whose low sixteen bits are FFFE or FFFF, and everything above
 * 10FFFF. */
bool is_valid_unicode(wchar_t wc)
{
    const bool below_d800 = (0 <= wc && wc <= 0xD7FF);
    const bool e000_to_fdcf = (0xE000 <= wc && wc <= 0xFDCF);
    const bool fdf0_to_fffd = (0xFDF0 <= wc && wc <= 0xFFFD);
    const bool above_ffff = (0xFFFF < wc && wc <= 0x10FFFF &&
		(wc & 0xFFFF) <= 0xFFFD);

    return below_d800 || e000_to_fdcf || fdf0_to_fffd || above_ffff;
}
#endif

943
#ifndef DISABLE_NANORC
944
945
946
947
948
949
/* Check if the string s is a valid multibyte string.  Return TRUE if it
 * is, and FALSE otherwise. */
bool is_valid_mbstring(const char *s)
{
    assert(s != NULL);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
950
    return
951
#ifdef ENABLE_UTF8
952
	use_utf8 ? (mbstowcs(NULL, s, 0) != (size_t)-1) :
953
954
955
#endif
	TRUE;
}
956
#endif /* !DISABLE_NANORC */