Commit 8c55d21b authored by David Lawrence Ramsey
Browse files

add better handling of invalid Unicode, plus a few miscellaneous minor fixes


git-svn-id: svn://svn.savannah.gnu.org/nano/trunk/nano@2973 35c25a1d-7b9e-4130-9fde-d3aeb78583b8
No related merge requests found
Showing with 41 additions and 28 deletions
+41 -28
...@@ -137,9 +137,15 @@ CVS code - ...@@ -137,9 +137,15 @@ CVS code -
- color.c: - color.c:
- Remove unneeded fcntl.h include. (DLR) - Remove unneeded fcntl.h include. (DLR)
- chars.c: - chars.c:
control_rep(), control_mbrep()
- Assert that the multibyte character passed in is a control
character if it's valid. (DLR)
mbrep() mbrep()
- New function, the equivalent of control_mbrep() for non-control - New function, the equivalent of control_mbrep() for non-control
characters. (DLR) characters. (DLR)
- Treat the Unicode characters D800-DFFF and FFFE-FFFF as
invalid, since the C library's multibyte functions don't seem
to. (DLR)
parse_mbchar() parse_mbchar()
- Remove now-unneeded bad_chr parameter. (DLR) - Remove now-unneeded bad_chr parameter. (DLR)
mbstrchr() mbstrchr()
...@@ -263,10 +269,13 @@ CVS code - ...@@ -263,10 +269,13 @@ CVS code -
as wc does. (DLR) as wc does. (DLR)
- winio.c: - winio.c:
get_word_kbinput() get_word_kbinput()
- Don't allow the input word to be between hexadecimal D800 to - Multiply the entered digits by hexadecimal numbers instead of
DFFF or hexadecimal FFFE to FFFD, as they are invalid Unicode decimal numbers for clarity, rename to get_unicode_kbinput(),
characters; rename variables word and word_digits to uni and and rename variables word and word_digits to uni and
uni_digits; and rename to get_unicode_kbinput(). (DLR) uni_digits. (DLR)
parse_verbatim_kbinput()
- Rename variables word_mb and word_mb_len to uni_mb and
uni_mb_len. (DLR)
display_string() display_string()
- Instead of using parse_mbchar()'s bad_chr parameter, use - Instead of using parse_mbchar()'s bad_chr parameter, use
mbrep() to get the representation of a bad character. (DLR) mbrep() to get the representation of a bad character. (DLR)
......
...@@ -184,6 +184,8 @@ bool is_word_mbchar(const char *c, bool allow_punct) ...@@ -184,6 +184,8 @@ bool is_word_mbchar(const char *c, bool allow_punct)
* is (c + 64). We return that character. */ * is (c + 64). We return that character. */
char control_rep(char c) char control_rep(char c)
{ {
assert(is_cntrl_char(c));
/* Treat newlines embedded in a line as encoded nulls. */ /* Treat newlines embedded in a line as encoded nulls. */
if (c == '\n') if (c == '\n')
return '@'; return '@';
...@@ -198,6 +200,8 @@ char control_rep(char c) ...@@ -198,6 +200,8 @@ char control_rep(char c)
* where ch is (c + 64). We return that wide character. */ * where ch is (c + 64). We return that wide character. */
wchar_t control_wrep(wchar_t wc) wchar_t control_wrep(wchar_t wc)
{ {
assert(is_cntrl_wchar(wc));
/* Treat newlines embedded in a line as encoded nulls. */ /* Treat newlines embedded in a line as encoded nulls. */
if (wc == '\n') if (wc == '\n')
return '@'; return '@';
...@@ -251,7 +255,10 @@ char *mbrep(const char *c, char *crep, int *crep_len) ...@@ -251,7 +255,10 @@ char *mbrep(const char *c, char *crep, int *crep_len)
if (ISSET(USE_UTF8)) { if (ISSET(USE_UTF8)) {
wchar_t wc; wchar_t wc;
if (mbtowc(&wc, c, MB_CUR_MAX) < 0) { /* Unicode D800-DFFF and FFFE-FFFF are invalid, even though
* they're parsed properly. */
if (mbtowc(&wc, c, MB_CUR_MAX) < 0 || ((0xD800 <= wc && wc <=
0xDFFF) || (0XFFFE <= wc && wc <= 0xFFFF))) {
mbtowc(NULL, NULL, 0); mbtowc(NULL, NULL, 0);
crep = (char *)bad_mbchar; crep = (char *)bad_mbchar;
*crep_len = bad_mbchar_len; *crep_len = bad_mbchar_len;
......
...@@ -1232,8 +1232,8 @@ int get_byte_kbinput(int kbinput ...@@ -1232,8 +1232,8 @@ int get_byte_kbinput(int kbinput
} }
/* Translate a Unicode sequence: turn a four-digit hexadecimal number /* Translate a Unicode sequence: turn a four-digit hexadecimal number
* from 0000 to D7FF or E000 to FFFD (case-insensitive) into its * from 0000 to FFFF (case-insensitive) into its corresponding multibyte
* corresponding multibyte value. */ * value. */
int get_unicode_kbinput(int kbinput int get_unicode_kbinput(int kbinput
#ifndef NANO_SMALL #ifndef NANO_SMALL
, bool reset , bool reset
...@@ -1273,11 +1273,9 @@ int get_unicode_kbinput(int kbinput ...@@ -1273,11 +1273,9 @@ int get_unicode_kbinput(int kbinput
case 2: case 2:
/* Two digits: add the digit we got to the 0x100's position /* Two digits: add the digit we got to the 0x100's position
* of the Unicode sequence holder. */ * of the Unicode sequence holder. */
if (('0' <= kbinput && kbinput <= '7') || (uni != 0xD000 && if ('0' <= kbinput && kbinput <= '9')
'8' <= kbinput && kbinput <= '9'))
uni += (kbinput - '0') * 0x100; uni += (kbinput - '0') * 0x100;
else if (uni != 0xd000 && 'a' <= tolower(kbinput) && else if ('a' <= tolower(kbinput) && tolower(kbinput) <= 'f')
tolower(kbinput) <= 'f')
uni += (tolower(kbinput) + 10 - 'a') * 0x100; uni += (tolower(kbinput) + 10 - 'a') * 0x100;
else else
/* If the character we got isn't a hexadecimal digit, or /* If the character we got isn't a hexadecimal digit, or
...@@ -1305,9 +1303,8 @@ int get_unicode_kbinput(int kbinput ...@@ -1305,9 +1303,8 @@ int get_unicode_kbinput(int kbinput
if ('0' <= kbinput && kbinput <= '9') { if ('0' <= kbinput && kbinput <= '9') {
uni += (kbinput - '0'); uni += (kbinput - '0');
retval = uni; retval = uni;
} else if (('a' <= tolower(kbinput) && } else if ('a' <= tolower(kbinput) && tolower(kbinput) <=
tolower(kbinput) <= 'd') || (uni != 0xFFF0 && 'e' <= 'f') {
tolower(kbinput) && tolower(kbinput) <= 'f')) {
uni += (tolower(kbinput) + 10 - 'a'); uni += (tolower(kbinput) + 10 - 'a');
retval = uni; retval = uni;
} else } else
...@@ -1418,13 +1415,13 @@ int *get_verbatim_kbinput(WINDOW *win, size_t *kbinput_len) ...@@ -1418,13 +1415,13 @@ int *get_verbatim_kbinput(WINDOW *win, size_t *kbinput_len)
* that, leave the input as-is. */ * that, leave the input as-is. */
int *parse_verbatim_kbinput(WINDOW *win, size_t *kbinput_len) int *parse_verbatim_kbinput(WINDOW *win, size_t *kbinput_len)
{ {
int *kbinput, word, *retval; int *kbinput, uni, *retval;
/* Read in the first keystroke. */ /* Read in the first keystroke. */
while ((kbinput = get_input(win, 1)) == NULL); while ((kbinput = get_input(win, 1)) == NULL);
/* Check whether the first keystroke is a hexadecimal digit. */ /* Check whether the first keystroke is a hexadecimal digit. */
word = get_unicode_kbinput(*kbinput uni = get_unicode_kbinput(*kbinput
#ifndef NANO_SMALL #ifndef NANO_SMALL
, FALSE , FALSE
#endif #endif
...@@ -1432,36 +1429,36 @@ int *parse_verbatim_kbinput(WINDOW *win, size_t *kbinput_len) ...@@ -1432,36 +1429,36 @@ int *parse_verbatim_kbinput(WINDOW *win, size_t *kbinput_len)
/* If the first keystroke isn't a hexadecimal digit, put back the /* If the first keystroke isn't a hexadecimal digit, put back the
* first keystroke. */ * first keystroke. */
if (word != ERR) if (uni != ERR)
unget_input(kbinput, 1); unget_input(kbinput, 1);
/* Otherwise, read in keystrokes until we have a complete word /* Otherwise, read in keystrokes until we have a complete word
* sequence, and put back the corresponding word value. */ * sequence, and put back the corresponding word value. */
else { else {
char *word_mb; char *uni_mb;
int word_mb_len, *seq, i; int uni_mb_len, *seq, i;
while (word == ERR) { while (uni == ERR) {
while ((kbinput = get_input(win, 1)) == NULL); while ((kbinput = get_input(win, 1)) == NULL);
word = get_unicode_kbinput(*kbinput uni = get_unicode_kbinput(*kbinput
#ifndef NANO_SMALL #ifndef NANO_SMALL
, FALSE , FALSE
#endif #endif
); );
} }
/* Put back the multibyte equivalent of the word value. */ /* Put back the multibyte equivalent of the Unicode value. */
word_mb = make_mbchar(word, &word_mb_len); uni_mb = make_mbchar(uni, &uni_mb_len);
seq = (int *)nmalloc(word_mb_len * sizeof(int)); seq = (int *)nmalloc(uni_mb_len * sizeof(int));
for (i = 0; i < word_mb_len; i++) for (i = 0; i < uni_mb_len; i++)
seq[i] = (unsigned char)word_mb[i]; seq[i] = (unsigned char)uni_mb[i];
unget_input(seq, word_mb_len); unget_input(seq, uni_mb_len);
free(seq); free(seq);
free(word_mb); free(uni_mb);
} }
/* Get the complete sequence, and save the characters in it as the /* Get the complete sequence, and save the characters in it as the
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment