X Tutup
Skip to content

Commit a24ee6a

Browse files
committed
fix(HtmlLexer): fix for unicode chars
fixes #6036 Closes #6061
1 parent df3074f commit a24ee6a

File tree

2 files changed

+99
-61
lines changed

2 files changed

+99
-61
lines changed

modules/angular2/src/compiler/html_lexer.ts

Lines changed: 76 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,13 @@ const $LT = 60;
7373
const $EQ = 61;
7474
const $GT = 62;
7575
const $QUESTION = 63;
76-
const $A = 65;
77-
const $Z = 90;
7876
const $LBRACKET = 91;
7977
const $RBRACKET = 93;
78+
const $A = 65;
79+
const $F = 70;
80+
const $X = 88;
81+
const $Z = 90;
82+
8083
const $a = 97;
8184
const $f = 102;
8285
const $z = 122;
@@ -102,7 +105,6 @@ class ControlFlowError {
102105
// See http://www.w3.org/TR/html51/syntax.html#writing
103106
class _HtmlTokenizer {
104107
private input: string;
105-
private inputLowercase: string;
106108
private length: number;
107109
// Note: this is the raw char code from `input` (case is preserved); use the *CaseInsensitive helpers for case-insensitive matching.
108110
private peek: number = -1;
@@ -117,7 +119,6 @@ class _HtmlTokenizer {
117119

118120
constructor(private file: ParseSourceFile) {
119121
this.input = file.content;
120-
this.inputLowercase = file.content.toLowerCase();
121122
this.length = file.content.length;
122123
this._advance();
123124
}
@@ -133,16 +134,16 @@ class _HtmlTokenizer {
133134
while (this.peek !== $EOF) {
134135
var start = this._getLocation();
135136
try {
136-
if (this._attemptChar($LT)) {
137-
if (this._attemptChar($BANG)) {
138-
if (this._attemptChar($LBRACKET)) {
137+
if (this._attemptCharCode($LT)) {
138+
if (this._attemptCharCode($BANG)) {
139+
if (this._attemptCharCode($LBRACKET)) {
139140
this._consumeCdata(start);
140-
} else if (this._attemptChar($MINUS)) {
141+
} else if (this._attemptCharCode($MINUS)) {
141142
this._consumeComment(start);
142143
} else {
143144
this._consumeDocType(start);
144145
}
145-
} else if (this._attemptChar($SLASH)) {
146+
} else if (this._attemptCharCode($SLASH)) {
146147
this._consumeTagClose(start);
147148
} else {
148149
this._consumeTagOpen(start);
@@ -205,50 +206,66 @@ class _HtmlTokenizer {
205206
this.column++;
206207
}
207208
this.index++;
208-
this.peek = this.index >= this.length ? $EOF : StringWrapper.charCodeAt(this.inputLowercase,
209-
this.index);
209+
this.peek = this.index >= this.length ? $EOF : StringWrapper.charCodeAt(this.input, this.index);
210210
}
211211

212-
private _attemptChar(charCode: number): boolean {
212+
private _attemptCharCode(charCode: number): boolean {
213213
if (this.peek === charCode) {
214214
this._advance();
215215
return true;
216216
}
217217
return false;
218218
}
219219

220-
private _requireChar(charCode: number) {
220+
private _attemptCharCodeCaseInsensitive(charCode: number): boolean {
221+
if (compareCharCodeCaseInsensitive(this.peek, charCode)) {
222+
this._advance();
223+
return true;
224+
}
225+
return false;
226+
}
227+
228+
private _requireCharCode(charCode: number) {
221229
var location = this._getLocation();
222-
if (!this._attemptChar(charCode)) {
230+
if (!this._attemptCharCode(charCode)) {
223231
throw this._createError(unexpectedCharacterErrorMsg(this.peek), location);
224232
}
225233
}
226234

227-
private _attemptChars(chars: string): boolean {
235+
private _attemptStr(chars: string): boolean {
228236
for (var i = 0; i < chars.length; i++) {
229-
if (!this._attemptChar(StringWrapper.charCodeAt(chars, i))) {
237+
if (!this._attemptCharCode(StringWrapper.charCodeAt(chars, i))) {
230238
return false;
231239
}
232240
}
233241
return true;
234242
}
235243

236-
private _requireChars(chars: string) {
244+
private _attemptStrCaseInsensitive(chars: string): boolean {
245+
for (var i = 0; i < chars.length; i++) {
246+
if (!this._attemptCharCodeCaseInsensitive(StringWrapper.charCodeAt(chars, i))) {
247+
return false;
248+
}
249+
}
250+
return true;
251+
}
252+
253+
private _requireStr(chars: string) {
237254
var location = this._getLocation();
238-
if (!this._attemptChars(chars)) {
255+
if (!this._attemptStr(chars)) {
239256
throw this._createError(unexpectedCharacterErrorMsg(this.peek), location);
240257
}
241258
}
242259

243-
private _attemptUntilFn(predicate: Function) {
260+
private _attemptCharCodeUntilFn(predicate: Function) {
244261
while (!predicate(this.peek)) {
245262
this._advance();
246263
}
247264
}
248265

249-
private _requireUntilFn(predicate: Function, len: number) {
266+
private _requireCharCodeUntilFn(predicate: Function, len: number) {
250267
var start = this._getLocation();
251-
this._attemptUntilFn(predicate);
268+
this._attemptCharCodeUntilFn(predicate);
252269
if (this.index - start.offset < len) {
253270
throw this._createError(unexpectedCharacterErrorMsg(this.peek), start);
254271
}
@@ -273,10 +290,10 @@ class _HtmlTokenizer {
273290
private _decodeEntity(): string {
274291
var start = this._getLocation();
275292
this._advance();
276-
if (this._attemptChar($HASH)) {
277-
let isHex = this._attemptChar($x);
293+
if (this._attemptCharCode($HASH)) {
294+
let isHex = this._attemptCharCode($x) || this._attemptCharCode($X);
278295
let numberStart = this._getLocation().offset;
279-
this._attemptUntilFn(isDigitEntityEnd);
296+
this._attemptCharCodeUntilFn(isDigitEntityEnd);
280297
if (this.peek != $SEMICOLON) {
281298
throw this._createError(unexpectedCharacterErrorMsg(this.peek), this._getLocation());
282299
}
@@ -291,7 +308,7 @@ class _HtmlTokenizer {
291308
}
292309
} else {
293310
let startPosition = this._savePosition();
294-
this._attemptUntilFn(isNamedEntityEnd);
311+
this._attemptCharCodeUntilFn(isNamedEntityEnd);
295312
if (this.peek != $SEMICOLON) {
296313
this._restorePosition(startPosition);
297314
return '&';
@@ -315,7 +332,7 @@ class _HtmlTokenizer {
315332
var parts = [];
316333
while (true) {
317334
tagCloseStart = this._getLocation();
318-
if (this._attemptChar(firstCharOfEnd) && attemptEndRest()) {
335+
if (this._attemptCharCode(firstCharOfEnd) && attemptEndRest()) {
319336
break;
320337
}
321338
if (this.index > tagCloseStart.offset) {
@@ -330,18 +347,18 @@ class _HtmlTokenizer {
330347

331348
private _consumeComment(start: ParseLocation) {
332349
this._beginToken(HtmlTokenType.COMMENT_START, start);
333-
this._requireChar($MINUS);
350+
this._requireCharCode($MINUS);
334351
this._endToken([]);
335-
var textToken = this._consumeRawText(false, $MINUS, () => this._attemptChars('->'));
352+
var textToken = this._consumeRawText(false, $MINUS, () => this._attemptStr('->'));
336353
this._beginToken(HtmlTokenType.COMMENT_END, textToken.sourceSpan.end);
337354
this._endToken([]);
338355
}
339356

340357
private _consumeCdata(start: ParseLocation) {
341358
this._beginToken(HtmlTokenType.CDATA_START, start);
342-
this._requireChars('cdata[');
359+
this._requireStr('CDATA[');
343360
this._endToken([]);
344-
var textToken = this._consumeRawText(false, $RBRACKET, () => this._attemptChars(']>'));
361+
var textToken = this._consumeRawText(false, $RBRACKET, () => this._attemptStr(']>'));
345362
this._beginToken(HtmlTokenType.CDATA_END, textToken.sourceSpan.end);
346363
this._endToken([]);
347364
}
@@ -367,7 +384,7 @@ class _HtmlTokenizer {
367384
} else {
368385
nameStart = nameOrPrefixStart;
369386
}
370-
this._requireUntilFn(isNameEnd, this.index === nameStart ? 1 : 0);
387+
this._requireCharCodeUntilFn(isNameEnd, this.index === nameStart ? 1 : 0);
371388
var name = this.input.substring(nameStart, this.index);
372389
return [prefix, name];
373390
}
@@ -381,16 +398,16 @@ class _HtmlTokenizer {
381398
}
382399
var nameStart = this.index;
383400
this._consumeTagOpenStart(start);
384-
lowercaseTagName = this.inputLowercase.substring(nameStart, this.index);
385-
this._attemptUntilFn(isNotWhitespace);
401+
lowercaseTagName = this.input.substring(nameStart, this.index).toLowerCase();
402+
this._attemptCharCodeUntilFn(isNotWhitespace);
386403
while (this.peek !== $SLASH && this.peek !== $GT) {
387404
this._consumeAttributeName();
388-
this._attemptUntilFn(isNotWhitespace);
389-
if (this._attemptChar($EQ)) {
390-
this._attemptUntilFn(isNotWhitespace);
405+
this._attemptCharCodeUntilFn(isNotWhitespace);
406+
if (this._attemptCharCode($EQ)) {
407+
this._attemptCharCodeUntilFn(isNotWhitespace);
391408
this._consumeAttributeValue();
392409
}
393-
this._attemptUntilFn(isNotWhitespace);
410+
this._attemptCharCodeUntilFn(isNotWhitespace);
394411
}
395412
this._consumeTagOpenEnd();
396413
} catch (e) {
@@ -416,11 +433,11 @@ class _HtmlTokenizer {
416433

417434
private _consumeRawTextWithTagClose(lowercaseTagName: string, decodeEntities: boolean) {
418435
var textToken = this._consumeRawText(decodeEntities, $LT, () => {
419-
if (!this._attemptChar($SLASH)) return false;
420-
this._attemptUntilFn(isNotWhitespace);
421-
if (!this._attemptChars(lowercaseTagName)) return false;
422-
this._attemptUntilFn(isNotWhitespace);
423-
if (!this._attemptChar($GT)) return false;
436+
if (!this._attemptCharCode($SLASH)) return false;
437+
this._attemptCharCodeUntilFn(isNotWhitespace);
438+
if (!this._attemptStrCaseInsensitive(lowercaseTagName)) return false;
439+
this._attemptCharCodeUntilFn(isNotWhitespace);
440+
if (!this._attemptCharCode($GT)) return false;
424441
return true;
425442
});
426443
this._beginToken(HtmlTokenType.TAG_CLOSE, textToken.sourceSpan.end);
@@ -453,27 +470,27 @@ class _HtmlTokenizer {
453470
this._advance();
454471
} else {
455472
var valueStart = this.index;
456-
this._requireUntilFn(isNameEnd, 1);
473+
this._requireCharCodeUntilFn(isNameEnd, 1);
457474
value = this.input.substring(valueStart, this.index);
458475
}
459476
this._endToken([this._processCarriageReturns(value)]);
460477
}
461478

462479
private _consumeTagOpenEnd() {
463-
var tokenType =
464-
this._attemptChar($SLASH) ? HtmlTokenType.TAG_OPEN_END_VOID : HtmlTokenType.TAG_OPEN_END;
480+
var tokenType = this._attemptCharCode($SLASH) ? HtmlTokenType.TAG_OPEN_END_VOID :
481+
HtmlTokenType.TAG_OPEN_END;
465482
this._beginToken(tokenType);
466-
this._requireChar($GT);
483+
this._requireCharCode($GT);
467484
this._endToken([]);
468485
}
469486

470487
private _consumeTagClose(start: ParseLocation) {
471488
this._beginToken(HtmlTokenType.TAG_CLOSE, start);
472-
this._attemptUntilFn(isNotWhitespace);
489+
this._attemptCharCodeUntilFn(isNotWhitespace);
473490
var prefixAndName;
474491
prefixAndName = this._consumePrefixAndName();
475-
this._attemptUntilFn(isNotWhitespace);
476-
this._requireChar($GT);
492+
this._attemptCharCodeUntilFn(isNotWhitespace);
493+
this._requireCharCode($GT);
477494
this._endToken(prefixAndName);
478495
}
479496

@@ -534,11 +551,19 @@ function isTextEnd(code: number): boolean {
534551
}
535552

536553
function isAsciiLetter(code: number): boolean {
537-
return code >= $a && code <= $z;
554+
return code >= $a && code <= $z || code >= $A && code <= $Z;
538555
}
539556

540557
function isAsciiHexDigit(code: number): boolean {
541-
return code >= $a && code <= $f || code >= $0 && code <= $9;
558+
return code >= $a && code <= $f || code >= $A && code <= $F || code >= $0 && code <= $9;
559+
}
560+
561+
function compareCharCodeCaseInsensitive(code1: number, code2: number): boolean {
562+
return toUpperCaseCharCode(code1) == toUpperCaseCharCode(code2);
563+
}
564+
565+
function toUpperCaseCharCode(code: number): number {
566+
return code >= $a && code <= $z ? code - $a + $A : code;
542567
}
543568

544569
function mergeTextTokens(srcTokens: HtmlToken[]): HtmlToken[] {

modules/angular2/test/compiler/html_lexer_spec.ts

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,9 @@ export function main() {
114114
});
115115
});
116116

117-
describe('cdata', () => {
118-
it('should parse cdata', () => {
119-
expect(tokenizeAndHumanizeParts('<![cdata[t\ne\rs\r\nt]]>'))
117+
describe('CDATA', () => {
118+
it('should parse CDATA', () => {
119+
expect(tokenizeAndHumanizeParts('<![CDATA[t\ne\rs\r\nt]]>'))
120120
.toEqual([
121121
[HtmlTokenType.CDATA_START],
122122
[HtmlTokenType.RAW_TEXT, 't\ne\ns\nt'],
@@ -126,22 +126,22 @@ export function main() {
126126
});
127127

128128
it('should store the locations', () => {
129-
expect(tokenizeAndHumanizeSourceSpans('<![cdata[t\ne\rs\r\nt]]>'))
129+
expect(tokenizeAndHumanizeSourceSpans('<![CDATA[t\ne\rs\r\nt]]>'))
130130
.toEqual([
131-
[HtmlTokenType.CDATA_START, '<![cdata['],
131+
[HtmlTokenType.CDATA_START, '<![CDATA['],
132132
[HtmlTokenType.RAW_TEXT, 't\ne\rs\r\nt'],
133133
[HtmlTokenType.CDATA_END, ']]>'],
134134
[HtmlTokenType.EOF, '']
135135
]);
136136
});
137137

138-
it('should report <![ without cdata[', () => {
138+
it('should report <![ without CDATA[', () => {
139139
expect(tokenizeAndHumanizeErrors('<![a'))
140140
.toEqual([[HtmlTokenType.CDATA_START, 'Unexpected character "a"', '0:3']]);
141141
});
142142

143143
it('should report missing end cdata', () => {
144-
expect(tokenizeAndHumanizeErrors('<![cdata['))
144+
expect(tokenizeAndHumanizeErrors('<![CDATA['))
145145
.toEqual([[HtmlTokenType.RAW_TEXT, 'Unexpected character "EOF"', '0:9']]);
146146
});
147147
});
@@ -367,8 +367,8 @@ export function main() {
367367
});
368368

369369
it('should parse hexadecimal entities', () => {
370-
expect(tokenizeAndHumanizeParts('&#x41;'))
371-
.toEqual([[HtmlTokenType.TEXT, 'A'], [HtmlTokenType.EOF]]);
370+
expect(tokenizeAndHumanizeParts('&#x41;&#X41;'))
371+
.toEqual([[HtmlTokenType.TEXT, 'AA'], [HtmlTokenType.EOF]]);
372372
});
373373

374374
it('should parse decimal entities', () => {
@@ -473,7 +473,7 @@ export function main() {
473473
});
474474

475475
it('should not detect entities', () => {
476-
expect(tokenizeAndHumanizeParts(`<script>&amp;</script>`))
476+
expect(tokenizeAndHumanizeParts(`<script>&amp;</SCRIPT>`))
477477
.toEqual([
478478
[HtmlTokenType.TAG_OPEN_START, null, 'script'],
479479
[HtmlTokenType.TAG_OPEN_END],
@@ -587,6 +587,19 @@ export function main() {
587587
});
588588
});
589589

590+
describe('unicode characters', () => {
591+
it('should support unicode characters', () => {
592+
expect(tokenizeAndHumanizeSourceSpans(`<p>İ</p>`))
593+
.toEqual([
594+
[HtmlTokenType.TAG_OPEN_START, '<p'],
595+
[HtmlTokenType.TAG_OPEN_END, '>'],
596+
[HtmlTokenType.TEXT, 'İ'],
597+
[HtmlTokenType.TAG_CLOSE, '</p>'],
598+
[HtmlTokenType.EOF, '']
599+
]);
600+
});
601+
});
602+
590603
});
591604
}
592605

0 commit comments

Comments
 (0)
X Tutup