=== modified file 't/03utf8.c' --- t/03utf8.c 2011-03-31 22:42:52 +0000 +++ t/03utf8.c 2011-03-31 17:59:07 +0000 @@ -7,7 +7,7 @@ TermKey *tk; TermKeyKey key; - plan_tests(57); + plan_tests(21); pipe(fd); @@ -72,97 +72,6 @@ is_int(key.type, TERMKEY_TYPE_UNICODE, "key.type UTF-8 4 high"); is_int(key.code.number, 0x10FFFF, "key.code.number UTF-8 4 high"); - /* Invalid continuations */ - - write(fd[1], "\xC2!", 2); - - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 2 invalid cont"); - is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 2 invalid cont"); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 2 invalid after"); - is_int(key.code.number, '!', "key.code.number UTF-8 2 invalid after"); - - write(fd[1], "\xE0!", 2); - - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 invalid cont"); - is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 3 invalid cont"); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 invalid after"); - is_int(key.code.number, '!', "key.code.number UTF-8 3 invalid after"); - - write(fd[1], "\xE0\xA0!", 3); - - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 invalid cont 2"); - is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 3 invalid cont 2"); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 invalid after"); - is_int(key.code.number, '!', "key.code.number UTF-8 3 invalid after"); - - write(fd[1], "\xF0!", 2); - - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid cont"); - is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 4 invalid cont"); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid after"); - is_int(key.code.number, '!', "key.code.number UTF-8 4 invalid after"); - - write(fd[1], "\xF0\x90!", 3); - - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid cont 2"); - is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 4 invalid cont 2"); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid after"); - is_int(key.code.number, '!', "key.code.number UTF-8 4 invalid after"); - - write(fd[1], "\xF0\x90\x80!", 4); - - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid cont 3"); - is_int(key.code.number, 0xFFFD, "key.code.number UTF-8 4 invalid cont 3"); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 invalid after"); - is_int(key.code.number, '!', "key.code.number UTF-8 4 invalid after"); - - /* Partials */ - - write(fd[1], "\xC2", 1); - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 2 partial"); - - write(fd[1], "\xA0", 1); - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 2 partial"); - is_int(key.code.number, 0x00A0, "key.code.number UTF-8 2 partial"); - - write(fd[1], "\xE0", 1); - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 3 partial"); - - write(fd[1], "\xA0", 1); - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 3 partial"); - - write(fd[1], "\x80", 1); - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 3 partial"); - is_int(key.code.number, 0x0800, "key.code.number UTF-8 3 partial"); - - write(fd[1], "\xF0", 1); - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 4 partial"); - - write(fd[1], "\x90", 1); - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 4 partial"); - - write(fd[1], "\x80", 1); - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_AGAIN, "getkey yields RES_AGAIN UTF-8 4 partial"); - - write(fd[1], "\x80", 1); - termkey_advisereadable(tk); - is_int(termkey_getkey(tk, &key), TERMKEY_RES_KEY, "getkey yields RES_KEY UTF-8 4 partial"); - is_int(key.code.number, 0x10000, "key.code.number UTF-8 4 partial"); - termkey_destroy(tk); return exit_status(); === modified file 'termkey.c' --- termkey.c 2011-03-31 22:42:52 +0000 +++ termkey.c 2011-03-31 13:07:55 +0000 @@ -422,76 +422,6 @@ } } -#define UTF8_INVALID 0xFFFD -static TermKeyResult parse_utf8(const unsigned char *bytes, size_t len, long *cp, size_t *nbytep) -{ - unsigned int nbytes; - - unsigned char b0 = bytes[0]; - - if(b0 < 0xc0) { - // Starts with a continuation byte - that's not right - *cp = UTF8_INVALID; - *nbytep = 1; - return TERMKEY_RES_KEY; - } - else if(b0 < 0xe0) { - nbytes = 2; - *cp = b0 & 0x1f; - } - else if(b0 < 0xf0) { - nbytes = 3; - *cp = b0 & 0x0f; - } - else if(b0 < 0xf8) { - nbytes = 4; - *cp = b0 & 0x07; - } - else if(b0 < 0xfc) { - nbytes = 5; - *cp = b0 & 0x03; - } - else if(b0 < 0xfe) { - nbytes = 6; - *cp = b0 & 0x01; - } - else { - *cp = UTF8_INVALID; - *nbytep = 1; - return TERMKEY_RES_KEY; - } - - for(unsigned int b = 1; b < nbytes; b++) { - unsigned char cb; - - if(b >= len) - return TERMKEY_RES_AGAIN; - - cb = bytes[b]; - if(cb < 0x80 || cb >= 0xc0) { - *cp = UTF8_INVALID; - *nbytep = b; - return TERMKEY_RES_KEY; - } - - *cp <<= 6; - *cp |= cb & 0x3f; - } - - // Check for overlong sequences - if(nbytes > utf8_seqlen(*cp)) - *cp = UTF8_INVALID; - - // Check for UTF-16 surrogates or invalid *cps - if((*cp >= 0xD800 && *cp <= 0xDFFF) || - *cp == 0xFFFE || - *cp == 0xFFFF) - *cp = UTF8_INVALID; - - *nbytep = nbytes; - return TERMKEY_RES_KEY; -} - static void emit_codepoint(TermKey *tk, long codepoint, TermKeyKey *key) { if(codepoint < 0x20) { @@ -557,6 +487,8 @@ fill_utf8(key); } +#define UTF8_INVALID 0xFFFD + static TermKeyResult peekkey(TermKey *tk, TermKeyKey *key, int force, size_t *nbytep) { int again = 0; @@ -672,24 +604,83 @@ } else if(tk->flags & TERMKEY_FLAG_UTF8) { // Some UTF-8 + unsigned int nbytes; long codepoint; - TermKeyResult res = parse_utf8(tk->buffer + tk->buffstart, tk->buffcount, &codepoint, nbytep); - - if(res == TERMKEY_RES_AGAIN && force) { + + key->type = TERMKEY_TYPE_UNICODE; + key->modifiers = 0; + + if(b0 < 0xc0) { + // Starts with a continuation byte - that's not right + (*tk->method.emit_codepoint)(tk, UTF8_INVALID, key); + *nbytep = 1; + return TERMKEY_RES_KEY; + } + else if(b0 < 0xe0) { + nbytes = 2; + codepoint = b0 & 0x1f; + } + else if(b0 < 0xf0) { + nbytes = 3; + codepoint = b0 & 0x0f; + } + else if(b0 < 0xf8) { + nbytes = 4; + codepoint = b0 & 0x07; + } + else if(b0 < 0xfc) { + nbytes = 5; + codepoint = b0 & 0x03; + } + else if(b0 < 0xfe) { + nbytes = 6; + codepoint = b0 & 0x01; + } + else { + (*tk->method.emit_codepoint)(tk, UTF8_INVALID, key); + *nbytep = 1; + return TERMKEY_RES_KEY; + } + + if(tk->buffcount < nbytes) { + if(!force) + return TERMKEY_RES_AGAIN; + /* There weren't enough bytes for a complete UTF-8 sequence but caller * demands an answer. About the best thing we can do here is eat as many * bytes as we have, and emit a UTF8_INVALID. If the remaining bytes * arrive later, they'll be invalid too. */ - codepoint = UTF8_INVALID; + (*tk->method.emit_codepoint)(tk, UTF8_INVALID, key); *nbytep = tk->buffcount; - res = TERMKEY_RES_KEY; - } - - key->type = TERMKEY_TYPE_UNICODE; - key->modifiers = 0; + return TERMKEY_RES_KEY; + } + + for(unsigned int b = 1; b < nbytes; b++) { + unsigned char cb = CHARAT(b); + if(cb < 0x80 || cb >= 0xc0) { + (*tk->method.emit_codepoint)(tk, UTF8_INVALID, key); + *nbytep = b - 1; + return TERMKEY_RES_KEY; + } + + codepoint <<= 6; + codepoint |= cb & 0x3f; + } + + // Check for overlong sequences + if(nbytes > utf8_seqlen(codepoint)) + codepoint = UTF8_INVALID; + + // Check for UTF-16 surrogates or invalid codepoints + if((codepoint >= 0xD800 && codepoint <= 0xDFFF) || + codepoint == 0xFFFE || + codepoint == 0xFFFF) + codepoint = UTF8_INVALID; + (*tk->method.emit_codepoint)(tk, codepoint, key); - return res; + *nbytep = nbytes; + return TERMKEY_RES_KEY; } else { // Non UTF-8 case - just report the raw byte