forked from NASAWorldWind/WorldWindJava
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathUTF8Writer.java
More file actions
385 lines (342 loc) · 12.5 KB
/
UTF8Writer.java
File metadata and controls
385 lines (342 loc) · 12.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
package org.codehaus.jackson.io;
import java.io.*;
public final class UTF8Writer
extends Writer
{
final static int SURR1_FIRST = 0xD800;
final static int SURR1_LAST = 0xDBFF;
final static int SURR2_FIRST = 0xDC00;
final static int SURR2_LAST = 0xDFFF;
final protected IOContext mContext;
OutputStream mOut;
byte[] mOutBuffer;
final int mOutBufferLast;
int mOutPtr;
/**
* When outputting chars from BMP, surrogate pairs need to be coalesced.
* To do this, both pairs must be known first; and since it is possible
* pairs may be split, we need temporary storage for the first half
*/
int mSurrogate = 0;
public UTF8Writer(IOContext ctxt, OutputStream out)
{
mContext = ctxt;
mOut = out;
mOutBuffer = ctxt.allocWriteEncodingBuffer();
/* Max. expansion for a single char (in unmodified UTF-8) is
* 4 bytes (or 3 depending on how you view it -- 4 when recombining
* surrogate pairs)
*/
mOutBufferLast = mOutBuffer.length - 4;
mOutPtr = 0;
}
@Override
public Writer append(char c)
throws IOException
{
write(c);
return this;
}
@Override
public void close()
throws IOException
{
if (mOut != null) {
if (mOutPtr > 0) {
mOut.write(mOutBuffer, 0, mOutPtr);
mOutPtr = 0;
}
OutputStream out = mOut;
mOut = null;
byte[] buf = mOutBuffer;
if (buf != null) {
mOutBuffer = null;
mContext.releaseWriteEncodingBuffer(buf);
}
out.close();
/* Let's 'flush' orphan surrogate, no matter what; but only
* after cleanly closing everything else.
*/
int code = mSurrogate;
mSurrogate = 0;
if (code > 0) {
throwIllegal(code);
}
}
}
@Override
public void flush()
throws IOException
{
if (mOutPtr > 0) {
mOut.write(mOutBuffer, 0, mOutPtr);
mOutPtr = 0;
}
mOut.flush();
}
@Override
public void write(char[] cbuf)
throws IOException
{
write(cbuf, 0, cbuf.length);
}
@Override
public void write(char[] cbuf, int off, int len)
throws IOException
{
if (len < 2) {
if (len == 1) {
write(cbuf[off]);
}
return;
}
// First: do we have a leftover surrogate to deal with?
if (mSurrogate > 0) {
char second = cbuf[off++];
--len;
write(convertSurrogate(second));
// will have at least one more char
}
int outPtr = mOutPtr;
byte[] outBuf = mOutBuffer;
int outBufLast = mOutBufferLast; // has 4 'spare' bytes
// All right; can just loop it nice and easy now:
len += off; // len will now be the end of input buffer
output_loop:
for (; off < len; ) {
/* First, let's ensure we can output at least 4 bytes
* (longest UTF-8 encoded codepoint):
*/
if (outPtr >= outBufLast) {
mOut.write(outBuf, 0, outPtr);
outPtr = 0;
}
int c = cbuf[off++];
// And then see if we have an Ascii char:
if (c < 0x80) { // If so, can do a tight inner loop:
outBuf[outPtr++] = (byte)c;
// Let's calc how many ascii chars we can copy at most:
int maxInCount = (len - off);
int maxOutCount = (outBufLast - outPtr);
if (maxInCount > maxOutCount) {
maxInCount = maxOutCount;
}
maxInCount += off;
ascii_loop:
while (true) {
if (off >= maxInCount) { // done with max. ascii seq
continue output_loop;
}
c = cbuf[off++];
if (c >= 0x80) {
break ascii_loop;
}
outBuf[outPtr++] = (byte) c;
}
}
// Nope, multi-byte:
if (c < 0x800) { // 2-byte
outBuf[outPtr++] = (byte) (0xc0 | (c >> 6));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
} else { // 3 or 4 bytes
// Surrogates?
if (c < SURR1_FIRST || c > SURR2_LAST) {
outBuf[outPtr++] = (byte) (0xe0 | (c >> 12));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// Yup, a surrogate:
if (c > SURR1_LAST) { // must be from first range
mOutPtr = outPtr;
throwIllegal(c);
}
mSurrogate = c;
// and if so, followed by another from next range
if (off >= len) { // unless we hit the end?
break;
}
c = convertSurrogate(cbuf[off++]);
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
mOutPtr = outPtr;
throwIllegal(c);
}
outBuf[outPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
}
}
mOutPtr = outPtr;
}
@Override
public void write(int c) throws IOException
{
// First; do we have a left over surrogate?
if (mSurrogate > 0) {
c = convertSurrogate(c);
// If not, do we start with a surrogate?
} else if (c >= SURR1_FIRST && c <= SURR2_LAST) {
// Illegal to get second part without first:
if (c > SURR1_LAST) {
throwIllegal(c);
}
// First part just needs to be held for now
mSurrogate = c;
return;
}
if (mOutPtr >= mOutBufferLast) { // let's require enough room, first
mOut.write(mOutBuffer, 0, mOutPtr);
mOutPtr = 0;
}
if (c < 0x80) { // ascii
mOutBuffer[mOutPtr++] = (byte) c;
} else {
int ptr = mOutPtr;
if (c < 0x800) { // 2-byte
mOutBuffer[ptr++] = (byte) (0xc0 | (c >> 6));
mOutBuffer[ptr++] = (byte) (0x80 | (c & 0x3f));
} else if (c <= 0xFFFF) { // 3 bytes
mOutBuffer[ptr++] = (byte) (0xe0 | (c >> 12));
mOutBuffer[ptr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
mOutBuffer[ptr++] = (byte) (0x80 | (c & 0x3f));
} else { // 4 bytes
if (c > 0x10FFFF) { // illegal
throwIllegal(c);
}
mOutBuffer[ptr++] = (byte) (0xf0 | (c >> 18));
mOutBuffer[ptr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
mOutBuffer[ptr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
mOutBuffer[ptr++] = (byte) (0x80 | (c & 0x3f));
}
mOutPtr = ptr;
}
}
@Override
public void write(String str) throws IOException
{
write(str, 0, str.length());
}
@Override
public void write(String str, int off, int len) throws IOException
{
if (len < 2) {
if (len == 1) {
write(str.charAt(off));
}
return;
}
// First: do we have a leftover surrogate to deal with?
if (mSurrogate > 0) {
char second = str.charAt(off++);
--len;
write(convertSurrogate(second));
// will have at least one more char (case of 1 char was checked earlier on)
}
int outPtr = mOutPtr;
byte[] outBuf = mOutBuffer;
int outBufLast = mOutBufferLast; // has 4 'spare' bytes
// All right; can just loop it nice and easy now:
len += off; // len will now be the end of input buffer
output_loop:
for (; off < len; ) {
/* First, let's ensure we can output at least 4 bytes
* (longest UTF-8 encoded codepoint):
*/
if (outPtr >= outBufLast) {
mOut.write(outBuf, 0, outPtr);
outPtr = 0;
}
int c = str.charAt(off++);
// And then see if we have an Ascii char:
if (c < 0x80) { // If so, can do a tight inner loop:
outBuf[outPtr++] = (byte)c;
// Let's calc how many ascii chars we can copy at most:
int maxInCount = (len - off);
int maxOutCount = (outBufLast - outPtr);
if (maxInCount > maxOutCount) {
maxInCount = maxOutCount;
}
maxInCount += off;
ascii_loop:
while (true) {
if (off >= maxInCount) { // done with max. ascii seq
continue output_loop;
}
c = str.charAt(off++);
if (c >= 0x80) {
break ascii_loop;
}
outBuf[outPtr++] = (byte) c;
}
}
// Nope, multi-byte:
if (c < 0x800) { // 2-byte
outBuf[outPtr++] = (byte) (0xc0 | (c >> 6));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
} else { // 3 or 4 bytes
// Surrogates?
if (c < SURR1_FIRST || c > SURR2_LAST) {
outBuf[outPtr++] = (byte) (0xe0 | (c >> 12));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// Yup, a surrogate:
if (c > SURR1_LAST) { // must be from first range
mOutPtr = outPtr;
throwIllegal(c);
}
mSurrogate = c;
// and if so, followed by another from next range
if (off >= len) { // unless we hit the end?
break;
}
c = convertSurrogate(str.charAt(off++));
if (c > 0x10FFFF) { // illegal, as per RFC 4627
mOutPtr = outPtr;
throwIllegal(c);
}
outBuf[outPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outPtr++] = (byte) (0x80 | (c & 0x3f));
}
}
mOutPtr = outPtr;
}
/*
////////////////////////////////////////////////////////////
// Internal methods
////////////////////////////////////////////////////////////
*/
/**
* Method called to calculate UTF codepoint, from a surrogate pair.
*/
private int convertSurrogate(int secondPart)
throws IOException
{
int firstPart = mSurrogate;
mSurrogate = 0;
// Ok, then, is the second part valid?
if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
throw new IOException("Broken surrogate pair: first char 0x"+Integer.toHexString(firstPart)+", second 0x"+Integer.toHexString(secondPart)+"; illegal combination");
}
return 0x10000 + ((firstPart - SURR1_FIRST) << 10) + (secondPart - SURR2_FIRST);
}
private void throwIllegal(int code)
throws IOException
{
if (code > 0x10FFFF) { // over max?
throw new IOException("Illegal character point (0x"+Integer.toHexString(code)+") to output; max is 0x10FFFF as per RFC 4627");
}
if (code >= SURR1_FIRST) {
if (code <= SURR1_LAST) { // Unmatched first part (closing without second part?)
throw new IOException("Unmatched first part of surrogate pair (0x"+Integer.toHexString(code)+")");
}
throw new IOException("Unmatched second part of surrogate pair (0x"+Integer.toHexString(code)+")");
}
// should we ever get this?
throw new IOException("Illegal character point (0x"+Integer.toHexString(code)+") to output");
}
}