X Tutup
Skip to content

Commit af3f4e4

Browse files
committed
tool_cb_wrt: fix invalid unicode for windows console
- Suppress an incomplete UTF-8 sequence at the end of the buffer.

- Attempt to reconstruct an incomplete UTF-8 sequence from prior call(s) in the current call.

Prior to this change, in the Windows console, UTF-8 sequences split between two or more calls to the write callback would cause invalid "replacement characters" (U+FFFD) to be printed instead of the actual Unicode character. This is because in Windows only UTF-16 encoded characters are printed to the console; therefore we convert the UTF-8 contents to UTF-16, which cannot be done with partial UTF-8 sequences.

Reported-by: Maksim Arhipov

Fixes #9841
Closes #10890
1 parent 0b947e8 commit af3f4e4

File tree

4 files changed

+135
-20
lines changed

4 files changed

+135
-20
lines changed

src/tool_cb_hdr.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,12 @@ size_t tool_header_cb(char *ptr, size_t size, size_t nmemb, void *userdata)
8787
}
8888
#endif
8989

90+
#ifdef WIN32
91+
/* Discard incomplete UTF-8 sequence buffered from body */
92+
if(outs->utf8seq[0])
93+
memset(outs->utf8seq, 0, sizeof(outs->utf8seq));
94+
#endif
95+
9096
/*
9197
* Write header data when curl option --dump-header (-D) is given.
9298
*/

src/tool_cb_wrt.c

Lines changed: 117 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -233,35 +233,132 @@ size_t tool_write_cb(char *buffer, size_t sz, size_t nmemb, void *userdata)
233233

234234
#ifdef WIN32
235235
fhnd = _get_osfhandle(fileno(outs->stream));
236+
/* if windows console then UTF-8 must be converted to UTF-16 */
236237
if(isatty(fileno(outs->stream)) &&
237238
GetConsoleScreenBufferInfo((HANDLE)fhnd, &console_info)) {
238-
DWORD in_len = (DWORD)(sz * nmemb);
239-
wchar_t* wc_buf;
239+
wchar_t *wc_buf;
240240
DWORD wc_len;
241+
unsigned char *rbuf = (unsigned char *)buffer;
242+
DWORD rlen = (DWORD)bytes;
241243

242-
/* calculate buffer size for wide characters */
243-
wc_len = MultiByteToWideChar(CP_UTF8, 0, buffer, in_len, NULL, 0);
244-
wc_buf = (wchar_t*) malloc(wc_len * sizeof(wchar_t));
245-
if(!wc_buf)
246-
return CURL_WRITEFUNC_ERROR;
244+
#define IS_TRAILING_BYTE(x) (0x80 <= (x) && (x) < 0xC0)
247245

248-
/* calculate buffer size for multi-byte characters */
249-
wc_len = MultiByteToWideChar(CP_UTF8, 0, buffer, in_len, wc_buf, wc_len);
250-
if(!wc_len) {
251-
free(wc_buf);
252-
return CURL_WRITEFUNC_ERROR;
246+
/* attempt to complete an incomplete UTF-8 sequence from previous call.
247+
the sequence does not have to be well-formed. */
248+
if(outs->utf8seq[0] && rlen) {
249+
bool complete = false;
250+
/* two byte sequence (lead byte 110yyyyy) */
251+
if(0xC0 <= outs->utf8seq[0] && outs->utf8seq[0] < 0xE0) {
252+
outs->utf8seq[1] = *rbuf++;
253+
--rlen;
254+
complete = true;
255+
}
256+
/* three byte sequence (lead byte 1110zzzz) */
257+
else if(0xE0 <= outs->utf8seq[0] && outs->utf8seq[0] < 0xF0) {
258+
if(!outs->utf8seq[1]) {
259+
outs->utf8seq[1] = *rbuf++;
260+
--rlen;
261+
}
262+
if(rlen && !outs->utf8seq[2]) {
263+
outs->utf8seq[2] = *rbuf++;
264+
--rlen;
265+
complete = true;
266+
}
267+
}
268+
/* four byte sequence (lead byte 11110uuu) */
269+
else if(0xF0 <= outs->utf8seq[0] && outs->utf8seq[0] < 0xF8) {
270+
if(!outs->utf8seq[1]) {
271+
outs->utf8seq[1] = *rbuf++;
272+
--rlen;
273+
}
274+
if(rlen && !outs->utf8seq[2]) {
275+
outs->utf8seq[2] = *rbuf++;
276+
--rlen;
277+
}
278+
if(rlen && !outs->utf8seq[3]) {
279+
outs->utf8seq[3] = *rbuf++;
280+
--rlen;
281+
complete = true;
282+
}
283+
}
284+
285+
if(complete) {
286+
WCHAR prefix[3] = {0}; /* UTF-16 (1-2 WCHARs) + NUL */
287+
288+
if(MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)outs->utf8seq, -1,
289+
prefix, sizeof(prefix)/sizeof(prefix[0]))) {
290+
DEBUGASSERT(prefix[2] == L'\0'); /* prefix[2] is the last element */
291+
if(!WriteConsoleW(
292+
(HANDLE) fhnd,
293+
prefix,
294+
prefix[1] ? 2 : 1,
295+
NULL,
296+
NULL)) {
297+
return CURL_WRITEFUNC_ERROR;
298+
}
299+
}
300+
/* else: UTF-8 input was not well formed and OS is pre-Vista which
301+
drops invalid characters instead of writing U+FFFD to output. */
302+
303+
memset(outs->utf8seq, 0, sizeof(outs->utf8seq));
304+
}
253305
}
254306

255-
if(!WriteConsoleW(
256-
(HANDLE) fhnd,
257-
wc_buf,
258-
wc_len,
259-
&wc_len,
260-
NULL)) {
307+
/* suppress an incomplete utf-8 sequence at end of rbuf */
308+
if(!outs->utf8seq[0] && rlen && (rbuf[rlen - 1] & 0x80)) {
309+
/* check for lead byte from a two, three or four byte sequence */
310+
if(0xC0 <= rbuf[rlen - 1] && rbuf[rlen - 1] < 0xF8) {
311+
outs->utf8seq[0] = rbuf[rlen - 1];
312+
rlen -= 1;
313+
}
314+
else if(rlen >= 2 && IS_TRAILING_BYTE(rbuf[rlen - 1])) {
315+
/* check for lead byte from a three or four byte sequence */
316+
if(0xE0 <= rbuf[rlen - 2] && rbuf[rlen - 2] < 0xF8) {
317+
outs->utf8seq[0] = rbuf[rlen - 2];
318+
outs->utf8seq[1] = rbuf[rlen - 1];
319+
rlen -= 2;
320+
}
321+
else if(rlen >= 3 && IS_TRAILING_BYTE(rbuf[rlen - 2])) {
322+
/* check for lead byte from a four byte sequence */
323+
if(0xF0 <= rbuf[rlen - 3] && rbuf[rlen - 3] < 0xF8) {
324+
outs->utf8seq[0] = rbuf[rlen - 3];
325+
outs->utf8seq[1] = rbuf[rlen - 2];
326+
outs->utf8seq[2] = rbuf[rlen - 1];
327+
rlen -= 3;
328+
}
329+
}
330+
}
331+
}
332+
333+
if(rlen) {
334+
/* calculate buffer size for wide characters */
335+
wc_len = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)rbuf, rlen, NULL, 0);
336+
if(!wc_len)
337+
return CURL_WRITEFUNC_ERROR;
338+
339+
wc_buf = (wchar_t*) malloc(wc_len * sizeof(wchar_t));
340+
if(!wc_buf)
341+
return CURL_WRITEFUNC_ERROR;
342+
343+
wc_len = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR)rbuf, rlen, wc_buf,
344+
wc_len);
345+
if(!wc_len) {
346+
free(wc_buf);
347+
return CURL_WRITEFUNC_ERROR;
348+
}
349+
350+
if(!WriteConsoleW(
351+
(HANDLE) fhnd,
352+
wc_buf,
353+
wc_len,
354+
NULL,
355+
NULL)) {
356+
free(wc_buf);
357+
return CURL_WRITEFUNC_ERROR;
358+
}
261359
free(wc_buf);
262-
return CURL_WRITEFUNC_ERROR;
263360
}
264-
free(wc_buf);
361+
265362
rc = bytes;
266363
}
267364
else

src/tool_operate.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,12 @@ static CURLcode post_per_transfer(struct GlobalConfig *global,
464464
}
465465
}
466466

467+
#ifdef WIN32
468+
/* Discard incomplete UTF-8 sequence buffered from body */
469+
if(outs->utf8seq[0])
470+
memset(outs->utf8seq, 0, sizeof(outs->utf8seq));
471+
#endif
472+
467473
/* if retry-max-time is non-zero, make sure we haven't exceeded the
468474
time */
469475
if(per->retry_numretries &&

src/tool_sdecls.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@
5757
* 'init' member holds original file size or offset at which truncation is
5858
* taking place. Always zero unless appending to a non-empty regular file.
5959
*
60+
* [Windows]
61+
* 'utf8seq' member holds an incomplete UTF-8 sequence destined for the console
62+
* until it can be completed (1-4 bytes) + NUL.
6063
*/
6164

6265
struct OutStruct {
@@ -68,6 +71,9 @@ struct OutStruct {
6871
FILE *stream;
6972
curl_off_t bytes;
7073
curl_off_t init;
74+
#ifdef WIN32
75+
unsigned char utf8seq[5];
76+
#endif
7177
};
7278

7379
/*

0 commit comments

Comments
 (0)
X Tutup