chars.c 21.5 KB
Newer Older
1
2
3
/**************************************************************************
 *   chars.c                                                              *
 *                                                                        *
 *   Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,  *
 *   2010, 2011, 2013, 2014 Free Software Foundation, Inc.                *
 *   Copyright (C) 2016 Benno Schulenberg                                 *
 *                                                                        *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 3, or (at your option)  *
 *   any later version.                                                   *
 *                                                                        *
 *   This program is distributed in the hope that it will be useful, but  *
 *   WITHOUT ANY WARRANTY; without even the implied warranty of           *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    *
 *   General Public License for more details.                             *
 *                                                                        *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the Free Software          *
 *   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA            *
 *   02110-1301, USA.                                                     *
 *                                                                        *
 **************************************************************************/

25
#include "proto.h"
26

27
#include <string.h>
28
29
#include <ctype.h>

30
#ifdef ENABLE_UTF8
31
#ifdef HAVE_WCHAR_H
32
33
#include <wchar.h>
#endif
34
#ifdef HAVE_WCTYPE_H
35
36
#include <wctype.h>
#endif
37

38
39
40
41
42
43
44
45
46
47
48
49
50
51
static bool use_utf8 = FALSE;
	/* Whether we've enabled UTF-8 support.  Starts out FALSE; the
	 * functions in this file test it to choose between their
	 * wide-character and their single-byte code paths. */

/* Enable UTF-8 support: set the file-local use_utf8 flag, so that the
 * multibyte helpers in this file take their wide-character paths. */
void utf8_init(void)
{
    use_utf8 = TRUE;
}

/* Is UTF-8 support enabled?  Return the current value of the file-local
 * use_utf8 flag. */
bool using_utf8(void)
{
    return use_utf8;
}
Benno Schulenberg's avatar
Benno Schulenberg committed
52
#endif /* ENABLE_UTF8 */
53

54
/* Append at most len2 bytes of str2 to str1, whose current length is
 * len1.  The first string is reallocated to make room, the second one
 * is freed, and the (possibly moved) first string is returned. */
char *addstrings(char* str1, size_t len1, char* str2, size_t len2)
{
    char *joined = charealloc(str1, len1 + len2 + 1);
    char *tail = joined + len1;

    /* Terminate the first part, so that strncat() appends right there. */
    *tail = '\0';
    strncat(tail, str2, len2);

    free(str2);

    return joined;
}

66
67
#ifndef HAVE_ISBLANK
/* This function is equivalent to isblank(): a blank is either a tab, or
 * a whitespace character that is not a control character. */
bool nisblank(int c)
{
    /* A tab is a control character, so it needs its own case. */
    if (c == '\t')
	return isspace(c);

    return (isspace(c) && !is_cntrl_char(c));
}
72
#endif
73

74
#if !defined(HAVE_ISWBLANK) && defined(ENABLE_UTF8)
75
/* This function is equivalent to iswblank(): a blank is either a tab,
 * or a wide whitespace character that is not a control character. */
bool niswblank(wchar_t wc)
{
    /* A tab is a control character, so it needs its own case. */
    if (wc == '\t')
	return iswspace(wc);

    return (iswspace(wc) && !is_cntrl_wchar(wc));
}
80
#endif
81

82
/* Return TRUE if the value of c is in byte range, and FALSE otherwise.
 * The value is in range exactly when truncating it to an unsigned char
 * does not change it. */
bool is_byte(int c)
{
    const unsigned char truncated = (unsigned char)c;

    return ((unsigned int)c == truncated);
}

88
89
90
91
92
93
94
95
96
97
/* Call mbtowc() with a NULL string, which puts its internal conversion
 * state back to the initial state.  The result of the call is ignored.
 * The decoders in this file call this after mbtowc() has reported an
 * invalid sequence. */
void mbtowc_reset(void)
{
    IGNORE_CALL_RESULT(mbtowc(NULL, NULL, 0));
}

/* Call wctomb() with a NULL buffer, which puts its internal conversion
 * state back to the initial state.  The result of the call is ignored. */
void wctomb_reset(void)
{
    IGNORE_CALL_RESULT(wctomb(NULL, 0));
}

98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/* This function is equivalent to isalpha() for multibyte characters.
 * In UTF-8 mode the character at c is decoded and tested with
 * iswalpha(); an undecodable sequence is never alphabetic.  Otherwise
 * only the first byte is tested, with isalpha(). */
bool is_alpha_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;
	const int decoded = mbtowc(&wide, c, MB_CUR_MAX);

	if (decoded >= 0)
	    return iswalpha(wide);

	/* The sequence is invalid: put the decoder back in order. */
	mbtowc_reset();
	return 0;
    }
#endif
    return isalpha((unsigned char)*c);
}

118
119
120
121
122
/* This function is equivalent to isalnum() for multibyte characters.
 * In UTF-8 mode the character at c is decoded and tested with
 * iswalnum(); an undecodable sequence is never alphanumeric.  Otherwise
 * only the first byte is tested, with isalnum(). */
bool is_alnum_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;
	const int decoded = mbtowc(&wide, c, MB_CUR_MAX);

	if (decoded >= 0)
	    return iswalnum(wide);

	/* The sequence is invalid: put the decoder back in order. */
	mbtowc_reset();
	return 0;
    }
#endif
    return isalnum((unsigned char)*c);
}

138
139
140
141
142
/* This function is equivalent to isblank() for multibyte characters.
 * In UTF-8 mode the character at c is decoded and tested with
 * iswblank(); an undecodable sequence is never blank.  Otherwise only
 * the first byte is tested, with isblank(). */
bool is_blank_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;
	const int decoded = mbtowc(&wide, c, MB_CUR_MAX);

	if (decoded >= 0)
	    return iswblank(wide);

	/* The sequence is invalid: put the decoder back in order. */
	mbtowc_reset();
	return 0;
    }
#endif
    return isblank((unsigned char)*c);
}

158
159
160
161
162
163
164
/* This function is equivalent to iscntrl(), except in that it only
 * handles non-high-bit control characters: it is TRUE for the values
 * 0 through 31 and for nothing else (so not for DEL either). */
bool is_ascii_cntrl_char(int c)
{
    return (c >= 0 && c <= 31);
}

165
/* This function is equivalent to iscntrl(), except in that it also
 * handles high-bit control characters.  A value counts as a control
 * character when bits 5 and 6 are both clear (for byte values that
 * means 0..31 and 128..159), or when it is 127 (DEL). */
bool is_cntrl_char(int c)
{
    const bool bits_5_and_6_clear = ((c & 0x60) == 0);

    return (bits_5_and_6_clear || c == 127);
}

/* This function is equivalent to iscntrl() for multibyte characters,
 * except in that it also handles multibyte control characters with
 * their high bits set.  In UTF-8 mode the bytes are inspected directly;
 * otherwise the first byte is handed to is_cntrl_char(). */
bool is_cntrl_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	/* A first byte below 32, or DEL. */
	if ((c[0] & 0xE0) == 0 || c[0] == 127)
	    return TRUE;

	/* The lead byte 0xC2 (-62 as a signed char) followed by a byte
	 * in the range 0x80..0x9F (below -96 as a signed char). */
	return ((signed char)c[0] == -62 && (signed char)c[1] < -96);
    }
#endif
    return is_cntrl_char((unsigned char)*c);
}

188
189
/* This function is equivalent to ispunct() for multibyte characters.
 * In UTF-8 mode the character at c is decoded and tested with
 * iswpunct(); an undecodable sequence is never punctuation.  Otherwise
 * only the first byte is tested, with ispunct(). */
bool is_punct_mbchar(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide;
	const int decoded = mbtowc(&wide, c, MB_CUR_MAX);

	if (decoded >= 0)
	    return iswpunct(wide);

	/* The sequence is invalid: put the decoder back in order. */
	mbtowc_reset();
	return 0;
    }
#endif
    return ispunct((unsigned char)*c);
}

208
209
210
/* Return TRUE when the given multibyte character c is a word-forming
 * character (that is: alphanumeric, or specified in wordchars, or
 * punctuation when allow_punct is TRUE), and FALSE otherwise. */
211
212
213
214
bool is_word_mbchar(const char *c, bool allow_punct)
{
    assert(c != NULL);

215
216
217
    if (*c == '\0')
	return FALSE;

218
219
220
221
    if (is_alnum_mbchar(c))
	return TRUE;

    if (word_chars != NULL && *word_chars != '\0') {
222
	bool wordforming;
223
224
225
226
	char *symbol = charalloc(MB_CUR_MAX + 1);
	int symlen = parse_mbchar(c, symbol, NULL);

	symbol[symlen] = '\0';
227
228
	wordforming = (strstr(word_chars, symbol) != NULL);
	free(symbol);
229

230
	return wordforming;
231
232
233
    }

    return (allow_punct && is_punct_mbchar(c));
234
235
}

236
/* Return the visible representation of control character c. */
237
char control_rep(const signed char c)
238
{
239
240
    assert(is_cntrl_char(c));

241
    /* An embedded newline is an encoded null. */
242
243
    if (c == '\n')
	return '@';
244
    else if (c == DEL_CODE)
245
	return '?';
246
247
248
249
    else if (c == -97)
	return '=';
    else if (c < 0)
	return c + 224;
250
251
252
253
    else
	return c + 64;
}

254
255
/* Return the visible representation of multibyte control character c.
 * In UTF-8 mode, when the first byte has its high bit set, the second
 * byte determines the representation; in all other cases the first
 * byte does. */
char control_mbrep(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8 && (unsigned char)c[0] >= 128)
	return control_rep(c[1]);
#endif
    return control_rep(c[0]);
}

270
271
/* Assess how many bytes the given (multibyte) character occupies.  Return -1
 * if the byte sequence is invalid, and return the number of bytes minus 8
 * when it encodes an invalid codepoint.  Also, in the second parameter,
 * return the number of columns that the character occupies.  Note that
 * *width is written only for a valid character in UTF-8 mode; in every
 * other case it is left untouched. */
int length_of_char(const char *c, int *width)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;
	int columns;
	const int charlen = mbtowc(&wc, c, MB_CUR_MAX);

	/* The byte sequence cannot be decoded. */
	if (charlen < 0) {
	    mbtowc_reset();
	    return -1;
	}

	/* The sequence decodes, but to a codepoint that is not valid. */
	if (!is_valid_unicode(wc))
	    return charlen - 8;

	/* When wcwidth() reports a negative width, assume one column. */
	columns = wcwidth(wc);
	*width = (columns < 0) ? 1 : columns;

	return charlen;
    }
#endif
    return 1;
}

304
305
306
307
308
/* This function is equivalent to wcwidth() for multibyte characters,
 * except that it never returns a negative value: an undecodable
 * sequence and a character for which wcwidth() gives -1 both count as
 * one column.  Outside UTF-8 mode every character is one column wide. */
int mbwidth(const char *c)
{
    assert(c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wc;
	int columns;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    return 1;
	}

	columns = wcwidth(wc);

	return (columns == -1) ? 1 : columns;
    }
#endif
    return 1;
}

/* Return the maximum width in bytes of a multibyte character: the
 * value of MB_CUR_MAX when in UTF-8 mode, and 1 otherwise. */
int mb_cur_max(void)
{
#ifdef ENABLE_UTF8
    if (use_utf8)
	return MB_CUR_MAX;
#endif
    return 1;
}

340
341
/* Convert the Unicode value in chr to a multibyte character with the
 * same wide character value as chr, if possible.  If the conversion
 * succeeds, return the (dynamically allocated) multibyte character and
 * its length.  Otherwise, return an undefined (dynamically allocated)
 * multibyte character and a length of zero.  Outside UTF-8 mode the
 * result is always the single low-order byte of chr, with length one.
 * The caller owns the returned buffer, which is not NUL-terminated. */
char *make_mbchar(long chr, int *chr_mb_len)
{
    char *chr_mb;

    assert(chr_mb_len != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	chr_mb = charalloc(MB_CUR_MAX);
	*chr_mb_len = wctomb(chr_mb, (wchar_t)chr);

	/* Reject invalid Unicode characters. */
	if (*chr_mb_len < 0 || !is_valid_unicode((wchar_t)chr)) {
	    wctomb_reset();
	    *chr_mb_len = 0;
	}
    } else
#endif
    {
	/* Take the low-order byte by value.  Reading the first byte of
	 * the long through a pointer would pick up its most significant
	 * byte on a big-endian machine. */
	const char byte = (char)chr;

	*chr_mb_len = 1;
	chr_mb = mallocstrncpy(NULL, &byte, 1);
    }

    return chr_mb;
}

/* Parse a multibyte character from buf.  Return the number of bytes
 * used.  If chr isn't NULL, store the multibyte character in it.  If
373
374
375
 * col isn't NULL, store the new display width in it.  If *buf is '\t',
 * we expect col to have the current display width. */
int parse_mbchar(const char *buf, char *chr, size_t *col)
376
377
378
379
380
{
    int buf_mb_len;

    assert(buf != NULL);

381
#ifdef ENABLE_UTF8
382
    if (use_utf8) {
383
384
385
	/* Get the number of bytes in the multibyte character. */
	buf_mb_len = mblen(buf, MB_CUR_MAX);

386
	/* When the multibyte sequence is invalid, only take the first byte. */
387
	if (buf_mb_len < 0) {
388
	    IGNORE_CALL_RESULT(mblen(NULL, 0));
389
	    buf_mb_len = 1;
390
391
	} else if (buf_mb_len == 0)
	    buf_mb_len++;
392

393
	/* When requested, store the multibyte character in chr. */
394
395
	if (chr != NULL) {
	    int i;
396

397
398
399
400
	    for (i = 0; i < buf_mb_len; i++)
		chr[i] = buf[i];
	}

401
	/* When requested, store the width of the wide character in col. */
402
403
404
405
406
	if (col != NULL) {
	    /* If we have a tab, get its width in columns using the
	     * current value of col. */
	    if (*buf == '\t')
		*col += tabsize - *col % tabsize;
407
408
	    /* If we have a control character, it's two columns wide: one
	     * column for the "^", and one for the visible character. */
409
	    else if (is_cntrl_mbchar(buf)) {
410
		*col += 2;
411
412
413
414
415
	    /* If we have a normal character, get its width in columns
	     * normally. */
	    } else
		*col += mbwidth(buf);
	}
416
    } else
417
#endif
418
    {
419
	/* A byte character is one byte long. */
420
421
	buf_mb_len = 1;

422
	/* When requested, store the byte character in chr. */
423
424
425
	if (chr != NULL)
	    *chr = *buf;

426
	/* When requested, store the width of the wide character in col. */
427
428
429
430
431
	if (col != NULL) {
	    /* If we have a tab, get its width in columns using the
	     * current value of col. */
	    if (*buf == '\t')
		*col += tabsize - *col % tabsize;
432
433
	    /* If we have a control character, it's two columns wide: one
	     * column for the "^", and one for the visible character. */
434
435
436
437
438
439
440
441
442
443
	    else if (is_cntrl_char((unsigned char)*buf))
		*col += 2;
	    /* If we have a normal character, it's one column wide. */
	    else
		(*col)++;
	}
    }

    return buf_mb_len;
}
444
445
446
447
448

/* Return the index in buf of the beginning of the multibyte character
 * before the one at pos.  For a pos of zero, zero is returned. */
size_t move_mbleft(const char *buf, size_t pos)
{
    size_t reach, scan, step = 0;

    assert(buf != NULL && pos <= strlen(buf));

    /* There is no library function to move backward one multibyte
     * character.  So step back as far as one character can possibly
     * be long (but not beyond the start of buf), and then walk
     * forward again, remembering the length of the last step. */
    reach = mb_cur_max();
    scan = (reach > pos) ? 0 : pos - reach;

    while (scan < pos) {
	step = parse_mbchar(buf + scan, NULL, NULL);
	scan += step;
    }

    return scan - step;
}

/* Return the index in buf of the beginning of the multibyte character
 * after the one at pos: pos plus the byte length of the character that
 * parse_mbchar() finds at pos. */
size_t move_mbright(const char *buf, size_t pos)
{
    const int char_len = parse_mbchar(buf + pos, NULL, NULL);

    return pos + char_len;
}
475
476
477
478
479

#ifndef HAVE_STRCASECMP
/* This function is equivalent to strcasecmp(). */
int nstrcasecmp(const char *s1, const char *s2)
{
480
    return strncasecmp(s1, s2, HIGHEST_POSITIVE);
481
482
483
484
485
486
}
#endif

/* This function is equivalent to strcasecmp() for multibyte strings. */
int mbstrcasecmp(const char *s1, const char *s2)
{
487
    return mbstrncasecmp(s1, s2, HIGHEST_POSITIVE);
488
489
490
491
492
493
}

#ifndef HAVE_STRNCASECMP
/* This function is equivalent to strncasecmp(): compare at most n bytes
 * of s1 and s2 while ignoring case.  Return zero when they are equal,
 * and otherwise the difference between the first two lowercased bytes
 * that differ.  The bytes are compared as unsigned chars. */
int nstrncasecmp(const char *s1, const char *s2, size_t n)
{
    if (s1 == s2)
	return 0;

    assert(s1 != NULL && s2 != NULL);

    for (; n > 0; s1++, s2++, n--) {
	/* Passing a negative plain char to tolower() is undefined, so
	 * convert each byte to unsigned char first. */
	const int lower1 = tolower((unsigned char)*s1);
	const int lower2 = tolower((unsigned char)*s2);

	/* Stop at the first difference, or when both strings end. */
	if (lower1 != lower2 || lower1 == '\0')
	    return lower1 - lower2;
    }

    return 0;
}
#endif

508
/* This function is equivalent to strncasecmp() for multibyte strings:
 * in UTF-8 mode, n counts characters instead of bytes.  Two valid
 * characters are compared by their lowercased wide values.  When a
 * character is invalid in either string, the raw first bytes are
 * compared; when these are equal but only one of the two is invalid,
 * the invalid one sorts last. */
int mbstrncasecmp(const char *s1, const char *s2, size_t n)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	wchar_t wide1, wide2;

	assert(s1 != NULL && s2 != NULL);

	for (; *s1 != '\0' && *s2 != '\0' && n > 0; n--) {
	    bool invalid1 = FALSE, invalid2 = FALSE;

	    if (mbtowc(&wide1, s1, MB_CUR_MAX) < 0) {
		mbtowc_reset();
		invalid1 = TRUE;
	    }

	    if (mbtowc(&wide2, s2, MB_CUR_MAX) < 0) {
		mbtowc_reset();
		invalid2 = TRUE;
	    }

	    if (!invalid1 && !invalid2) {
		const int difference = towlower(wide1) - towlower(wide2);

		if (difference != 0)
		    return difference;
	    } else if (*s1 != *s2) {
		return (unsigned char)*s1 - (unsigned char)*s2;
	    } else if (invalid1 != invalid2) {
		return (invalid1 ? 1 : -1);
	    }

	    /* These two characters match; step over them. */
	    s1 += move_mbright(s1, 0);
	    s2 += move_mbright(s2, 0);
	}

	/* When the limit was reached, everything compared was equal. */
	if (n == 0)
	    return 0;

	return (unsigned char)*s1 - (unsigned char)*s2;
    }
#endif
    return strncasecmp(s1, s2, n);
}

#ifndef HAVE_STRCASESTR
555
/* This function is equivalent to strcasestr(): return a pointer to the
 * first place in haystack where needle occurs when case is ignored, or
 * NULL when it occurs nowhere.  An empty needle matches at the start. */
char *nstrcasestr(const char *haystack, const char *needle)
{
    size_t needle_len;
    const char *spot;

    assert(haystack != NULL && needle != NULL);

    if (*needle == '\0')
	return (char *)haystack;

    needle_len = strlen(needle);

    /* Try each position in turn, until the end of the haystack. */
    for (spot = haystack; *spot != '\0'; spot++) {
	if (strncasecmp(spot, needle, needle_len) == 0)
	    return (char *)spot;
    }

    return NULL;
}
#endif

578
/* This function is equivalent to strcasestr() for multibyte strings.
 * In UTF-8 mode the haystack is stepped through one character at a
 * time, and at each position the first mbstrlen(needle) characters are
 * compared while ignoring case.  Otherwise strcasestr() does the work. */
char *mbstrcasestr(const char *haystack, const char *needle)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	size_t needle_len;

	assert(haystack != NULL && needle != NULL);

	/* An empty needle matches right at the start. */
	if (*needle == '\0')
	    return (char *)haystack;

	needle_len = mbstrlen(needle);

	for (; *haystack != '\0'; haystack += move_mbright(haystack, 0)) {
	    if (mbstrncasecmp(haystack, needle, needle_len) == 0)
		return (char *)haystack;
	}

	return NULL;
    }
#endif
    return (char *) strcasestr(haystack, needle);
}

605
#if !defined(NANO_TINY) || !defined(DISABLE_TABCOMP)
606
/* This function is equivalent to strstr(), except in that it scans the
 * string in reverse, starting at rev_start.  Return a pointer to the
 * last occurrence of needle in haystack that begins at or before
 * rev_start, or NULL when there is none.  An empty needle matches at
 * rev_start itself. */
char *revstrstr(const char *haystack, const char *needle, const char
	*rev_start)
{
    size_t rev_start_len, needle_len;

    assert(haystack != NULL && needle != NULL && rev_start != NULL);

    if (*needle == '\0')
	return (char *)rev_start;

    needle_len = strlen(needle);

    if (strlen(haystack) < needle_len)
	return NULL;

    /* A starting point before the haystack leaves nothing to scan. */
    if (rev_start < haystack)
	return NULL;

    /* The number of bytes from the current position to the end. */
    rev_start_len = strlen(rev_start);

    for (;;) {
	if (rev_start_len >= needle_len && strncmp(rev_start, needle,
		needle_len) == 0)
	    return (char *)rev_start;

	/* Stop at the head of the haystack; stepping the pointer to
	 * before the start of the string would be undefined. */
	if (rev_start == haystack)
	    return NULL;

	rev_start--;
	rev_start_len++;
    }
}
633
#endif /* !NANO_TINY || !DISABLE_TABCOMP */
634

635
#ifndef NANO_TINY
636
/* This function is equivalent to strcasestr(), except in that it scans
 * the string in reverse, starting at rev_start.  Return a pointer to
 * the last case-insensitive occurrence of needle in haystack that
 * begins at or before rev_start, or NULL when there is none.  An empty
 * needle matches at rev_start itself. */
char *revstrcasestr(const char *haystack, const char *needle, const char
	*rev_start)
{
    size_t rev_start_len, needle_len;

    assert(haystack != NULL && needle != NULL && rev_start != NULL);

    if (*needle == '\0')
	return (char *)rev_start;

    needle_len = strlen(needle);

    if (strlen(haystack) < needle_len)
	return NULL;

    /* A starting point before the haystack leaves nothing to scan. */
    if (rev_start < haystack)
	return NULL;

    /* The number of bytes from the current position to the end. */
    rev_start_len = strlen(rev_start);

    for (;;) {
	if (rev_start_len >= needle_len && strncasecmp(rev_start,
		needle, needle_len) == 0)
	    return (char *)rev_start;

	/* Stop at the head of the haystack; stepping the pointer to
	 * before the start of the string would be undefined. */
	if (rev_start == haystack)
	    return NULL;

	rev_start--;
	rev_start_len++;
    }
}
663
664

/* This function is equivalent to strcasestr() for multibyte strings,
 * except in that it scans the string in reverse, starting at rev_start.
 * In UTF-8 mode the scan steps back one character at a time; otherwise
 * revstrcasestr() does the work. */
char *mbrevstrcasestr(const char *haystack, const char *needle, const
	char *rev_start)
{
#ifdef ENABLE_UTF8
    if (use_utf8) {
	size_t rev_start_len, needle_len;

	assert(haystack != NULL && needle != NULL && rev_start != NULL);

	/* An empty needle matches right at the starting point. */
	if (*needle == '\0')
	    return (char *)rev_start;

	needle_len = mbstrlen(needle);

	if (mbstrlen(haystack) < needle_len)
	    return NULL;

	/* The number of characters from the current position to the end. */
	rev_start_len = mbstrlen(rev_start);

	for (;;) {
	    if (rev_start_len >= needle_len &&
			mbstrncasecmp(rev_start, needle, needle_len) == 0)
		return (char *)rev_start;

	    /* If we've reached the head of the haystack, we found nothing. */
	    if (rev_start == haystack)
		return NULL;

	    rev_start = haystack + move_mbleft(haystack, rev_start - haystack);
	    rev_start_len++;
	}
    }
#endif
    return revstrcasestr(haystack, needle, rev_start);
}
701
#endif /* !NANO_TINY */
702

703
704
705
706
707
708
/* This function is equivalent to strlen() for multibyte strings.  It
 * returns what mbstrnlen() returns when given the largest possible
 * size_t value as the limit. */
size_t mbstrlen(const char *s)
{
    return mbstrnlen(s, (size_t)-1);
}

709
710
711
712
713
714
715
716
#ifndef HAVE_STRNLEN
/* This function is equivalent to strnlen(): return the length of s in
 * bytes, but never more than maxlen. */
size_t nstrnlen(const char *s, size_t maxlen)
{
    size_t count = 0;

    assert(s != NULL);

    /* Count bytes until the terminator or the limit is reached. */
    while (s[count] != '\0' && count < maxlen)
	count++;

    return count;
}
#endif

/* This function is equivalent to strnlen() for multibyte strings: in
 * UTF-8 mode it counts characters instead of bytes, up to at most
 * maxlen of them.  Otherwise strnlen() does the work. */
size_t mbstrnlen(const char *s, size_t maxlen)
{
    assert(s != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	size_t count = 0;

	/* Step over one character per iteration. */
	while (*s != '\0' && maxlen > 0) {
	    s += move_mbright(s, 0);
	    maxlen--;
	    count++;
	}

	return count;
    }
#endif
    return strnlen(s, maxlen);
}
742

743
#if !defined(NANO_TINY) || !defined(DISABLE_JUSTIFY)
744
/* This function is equivalent to strchr() for multibyte strings:
 * return a pointer to the first occurrence in s of the multibyte
 * character that c starts with, or NULL when it does not occur.  In
 * UTF-8 mode an invalid sequence is treated as a single byte, and an
 * invalid byte only ever matches an invalid byte with the same value.
 * Outside UTF-8 mode strchr() is used with the first byte of c. */
char *mbstrchr(const char *s, const char *c)
{
    assert(s != NULL && c != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	bool bad_c_mb = FALSE;
	wchar_t ws, wc;

	if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	    mbtowc_reset();
	    wc = (unsigned char)*c;
	    bad_c_mb = TRUE;
	}

	while (*s != '\0') {
	    /* Judge every character afresh: an invalid sequence earlier
	     * in s must not mark all the later characters as invalid. */
	    bool bad_s_mb = FALSE;
	    int s_mb_len = parse_mbchar(s, NULL, NULL);

	    /* Decode straight from s, limited to this one character. */
	    if (mbtowc(&ws, s, s_mb_len) < 0) {
		mbtowc_reset();
		ws = (unsigned char)*s;
		bad_s_mb = TRUE;
	    }

	    if (bad_s_mb == bad_c_mb && ws == wc)
		return (char *)s;

	    s += s_mb_len;
	}

	return NULL;
    } else
#endif
	return (char *) strchr(s, *c);
}
788
#endif /* !NANO_TINY || !DISABLE_JUSTIFY */
789

790
791
792
793
794
795
796
#ifndef NANO_TINY
/* This function is equivalent to strpbrk() for multibyte strings:
 * return a pointer to the first character in s that also occurs in
 * accept, or NULL when there is no such character. */
char *mbstrpbrk(const char *s, const char *accept)
{
    assert(s != NULL && accept != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	/* Look up each character of s in the accept set. */
	while (*s != '\0') {
	    if (mbstrchr(accept, s) != NULL)
		return (char *)s;

	    s += move_mbright(s, 0);
	}

	return NULL;
    }
#endif
    return (char *) strpbrk(s, accept);
}

/* This function is equivalent to strpbrk(), except in that it scans the
 * string in reverse, starting at rev_start.  Return a pointer to the
 * last byte of s at or before rev_start that occurs in accept, or NULL
 * when there is none.  When rev_start points at the terminating NUL,
 * the scan starts at the byte before it. */
char *revstrpbrk(const char *s, const char *accept, const char
	*rev_start)
{
    assert(s != NULL && accept != NULL && rev_start != NULL);

    if (*rev_start == '\0') {
	if (rev_start == s)
	   return NULL;
	rev_start--;
    }

    /* A starting point before the string leaves nothing to scan. */
    if (rev_start < s)
	return NULL;

    for (;;) {
	if (strchr(accept, *rev_start) != NULL)
	    return (char *)rev_start;

	/* Stop at the head of the string; stepping the pointer to
	 * before the start of the string would be undefined. */
	if (rev_start == s)
	    return NULL;

	rev_start--;
    }
}

/* This function is equivalent to strpbrk() for multibyte strings,
 * except in that it scans the string in reverse, starting at rev_start.
 * In UTF-8 mode the scan steps back one character at a time; otherwise
 * revstrpbrk() does the work. */
char *mbrevstrpbrk(const char *s, const char *accept, const char
	*rev_start)
{
    assert(s != NULL && accept != NULL && rev_start != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	/* When starting at the terminating NUL, begin with the
	 * character before it -- if there is one. */
	if (*rev_start == '\0') {
	    if (rev_start == s)
		return NULL;

	    rev_start = s + move_mbleft(s, rev_start - s);
	}

	for (;;) {
	    if (mbstrchr(accept, rev_start) != NULL)
		return (char *)rev_start;

	    /* If we've reached the head of the string, we found nothing. */
	    if (rev_start == s)
		return NULL;

	    rev_start = s + move_mbleft(s, rev_start - s);
	}
    }
#endif
    return revstrpbrk(s, accept, rev_start);
}
#endif /* !NANO_TINY */

861
#if !defined(DISABLE_NANORC) && (!defined(NANO_TINY) || !defined(DISABLE_JUSTIFY))
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
/* Return TRUE if the string s contains one or more blank characters
 * (as judged by isblank()), and FALSE otherwise. */
bool has_blank_chars(const char *s)
{
    assert(s != NULL);

    /* Skip everything that is not blank.  Each byte is converted to
     * unsigned char first, because handing a negative plain char to
     * isblank() is undefined. */
    while (*s != '\0' && !isblank((unsigned char)*s))
	s++;

    /* We stopped before the end only if a blank was found. */
    return (*s != '\0');
}

/* Return TRUE if the multibyte string s contains one or more blank
 * multibyte characters, and FALSE otherwise.  Outside UTF-8 mode
 * has_blank_chars() does the work. */
bool has_blank_mbchars(const char *s)
{
    assert(s != NULL);

#ifdef ENABLE_UTF8
    if (use_utf8) {
	/* Test each character in place.  Copying it to a scratch buffer
	 * first is unneeded, and an unterminated scratch buffer could
	 * make the decoder look at bytes that were never written. */
	for (; *s != '\0'; s += move_mbright(s, 0)) {
	    if (is_blank_mbchar(s))
		return TRUE;
	}

	return FALSE;
    } else
#endif
	return has_blank_chars(s);
}
903
#endif /* !DISABLE_NANORC && (!NANO_TINY || !DISABLE_JUSTIFY) */
904

905
#ifdef ENABLE_UTF8
906
/* Return TRUE if wc is valid Unicode, and FALSE otherwise.  Rejected
 * are: negative values, 0xD800..0xDFFF, 0xFDD0..0xFDEF, every value
 * whose low sixteen bits are 0xFFFE or 0xFFFF, and everything above
 * 0x10FFFF. */
bool is_valid_unicode(wchar_t wc)
{
    /* The range 0x0000..0xFFFF has three valid stretches. */
    if (0 <= wc && wc <= 0xFFFF)
	return (wc <= 0xD7FF ||
		(0xE000 <= wc && wc <= 0xFDCF) ||
		(0xFDF0 <= wc && wc <= 0xFFFD));

    /* Beyond that, only the last two values of each block of
     * 0x10000 are invalid. */
    return (0xFFFF < wc && wc <= 0x10FFFF && (wc & 0xFFFF) <= 0xFFFD);
}
#endif

916
#ifndef DISABLE_NANORC
917
918
919
920
921
922
/* Check if the string s is a valid multibyte string.  Return TRUE if it
 * is, and FALSE otherwise. */
bool is_valid_mbstring(const char *s)
{
    assert(s != NULL);

David Lawrence Ramsey's avatar
David Lawrence Ramsey committed
923
    return
924
#ifdef ENABLE_UTF8
925
	use_utf8 ? (mbstowcs(NULL, s, 0) != (size_t)-1) :
926
927
928
#endif
	TRUE;
}
929
#endif /* !DISABLE_NANORC */