fix(HtmlLexer): handle CR in input stream per HTML spec

vicb · vicb · commit 9850e6870389 · 2015-12-08T02:18:20.000Z
Fixes #5618
Closes #5629
diff --git a/modules/angular2/src/compiler/html_lexer.ts b/modules/angular2/src/compiler/html_lexer.ts
@@ -83,6 +83,9 @@ const $x = 120;
 
 const $NBSP = 160;
 
+var CRLF_REGEXP = /\r\n/g;
+var CR_REGEXP = /\r/g;
+
 function unexpectedCharacterErrorMsg(charCode: number): string {
   var char = charCode === $EOF ? 'EOF' : StringWrapper.fromCharCode(charCode);
   return `Unexpected character "${char}"`;
@@ -119,6 +122,14 @@ class _HtmlTokenizer {
     this._advance();
   }
 
+  private _processCarriageReturns(content: string): string {
+    // http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
+    // In order to keep the original position in the source, we cannot pre-process it.
+    // Instead, CRs are processed right before instantiating the tokens.
+    content = StringWrapper.replaceAll(content, CRLF_REGEXP, '\r');
+    return StringWrapper.replaceAll(content, CR_REGEXP, '\n');
+  }
+
   tokenize(): HtmlTokenizeResult {
     while (this.peek !== $EOF) {
       var start = this._getLocation();
@@ -315,7 +326,7 @@ class _HtmlTokenizer {
         parts.push(this._readChar(decodeEntities));
       }
     }
-    return this._endToken([parts.join('')], tagCloseStart);
+    return this._endToken([this._processCarriageReturns(parts.join(''))], tagCloseStart);
   }
 
   private _consumeComment(start: ParseLocation) {
@@ -428,7 +439,7 @@ class _HtmlTokenizer {
       this._requireUntilFn(isNameEnd, 1);
       value = this.input.substring(valueStart, this.index);
     }
-    this._endToken([value]);
+    this._endToken([this._processCarriageReturns(value)]);
   }
 
   private _consumeTagOpenEnd() {
@@ -456,7 +467,7 @@ class _HtmlTokenizer {
     while (!isTextEnd(this.peek)) {
       parts.push(this._readChar(true));
     }
-    this._endToken([parts.join('')]);
+    this._endToken([this._processCarriageReturns(parts.join(''))]);
   }
 
   private _savePosition(): number[] { return [this.peek, this.index, this.column, this.line]; }
diff --git a/modules/angular2/test/compiler/html_lexer_spec.ts b/modules/angular2/test/compiler/html_lexer_spec.ts
@@ -53,26 +53,38 @@ export function main() {
               [HtmlTokenType.EOF, '2:5']
             ]);
       });
+
+      it('should work with CR and LF', () => {
+        expect(tokenizeAndHumanizeLineColumn('<t\n>\r\na\r</t>'))
+            .toEqual([
+              [HtmlTokenType.TAG_OPEN_START, '0:0'],
+              [HtmlTokenType.TAG_OPEN_END, '1:0'],
+              [HtmlTokenType.TEXT, '1:1'],
+              [HtmlTokenType.TAG_CLOSE, '2:1'],
+              [HtmlTokenType.EOF, '2:5']
+            ]);
+      });
     });
 
     describe('comments', () => {
       it('should parse comments', () => {
-        expect(tokenizeAndHumanizeParts('<!--test-->'))
+        expect(tokenizeAndHumanizeParts('<!--t\ne\rs\r\nt-->'))
             .toEqual([
               [HtmlTokenType.COMMENT_START],
-              [HtmlTokenType.RAW_TEXT, 'test'],
+              [HtmlTokenType.RAW_TEXT, 't\ne\ns\nt'],
               [HtmlTokenType.COMMENT_END],
               [HtmlTokenType.EOF]
             ]);
       });
 
-      it('should store the locations', () => {expect(tokenizeAndHumanizeSourceSpans('<!--test-->'))
-                                                  .toEqual([
-                                                    [HtmlTokenType.COMMENT_START, '<!--'],
-                                                    [HtmlTokenType.RAW_TEXT, 'test'],
-                                                    [HtmlTokenType.COMMENT_END, '-->'],
-                                                    [HtmlTokenType.EOF, '']
-                                                  ])});
+      it('should store the locations',
+         () => {expect(tokenizeAndHumanizeSourceSpans('<!--t\ne\rs\r\nt-->'))
+                    .toEqual([
+                      [HtmlTokenType.COMMENT_START, '<!--'],
+                      [HtmlTokenType.RAW_TEXT, 't\ne\rs\r\nt'],
+                      [HtmlTokenType.COMMENT_END, '-->'],
+                      [HtmlTokenType.EOF, '']
+                    ])});
 
       it('should report <!- without -', () => {
         expect(tokenizeAndHumanizeErrors('<!-a'))
@@ -104,20 +116,20 @@ export function main() {
 
     describe('cdata', () => {
       it('should parse cdata', () => {
-        expect(tokenizeAndHumanizeParts('<![cdata[test]]>'))
+        expect(tokenizeAndHumanizeParts('<![cdata[t\ne\rs\r\nt]]>'))
             .toEqual([
               [HtmlTokenType.CDATA_START],
-              [HtmlTokenType.RAW_TEXT, 'test'],
+              [HtmlTokenType.RAW_TEXT, 't\ne\ns\nt'],
               [HtmlTokenType.CDATA_END],
               [HtmlTokenType.EOF]
             ]);
       });
 
       it('should store the locations', () => {
-        expect(tokenizeAndHumanizeSourceSpans('<![cdata[test]]>'))
+        expect(tokenizeAndHumanizeSourceSpans('<![cdata[t\ne\rs\r\nt]]>'))
             .toEqual([
               [HtmlTokenType.CDATA_START, '<![cdata['],
-              [HtmlTokenType.RAW_TEXT, 'test'],
+              [HtmlTokenType.RAW_TEXT, 't\ne\rs\r\nt'],
               [HtmlTokenType.CDATA_END, ']]>'],
               [HtmlTokenType.EOF, '']
             ]);
@@ -301,6 +313,17 @@ export function main() {
             ]);
       });
 
+      it('should parse values with CR and LF', () => {
+        expect(tokenizeAndHumanizeParts("<t a='t\ne\rs\r\nt'>"))
+            .toEqual([
+              [HtmlTokenType.TAG_OPEN_START, null, 't'],
+              [HtmlTokenType.ATTR_NAME, null, 'a'],
+              [HtmlTokenType.ATTR_VALUE, 't\ne\ns\nt'],
+              [HtmlTokenType.TAG_OPEN_END],
+              [HtmlTokenType.EOF]
+            ]);
+      });
+
       it('should store the locations', () => {
         expect(tokenizeAndHumanizeSourceSpans('<t a=b>'))
             .toEqual([
@@ -406,6 +429,11 @@ export function main() {
             .toEqual([[HtmlTokenType.TEXT, 'a'], [HtmlTokenType.EOF]]);
       });
 
+      it('should handle CR & LF', () => {
+        expect(tokenizeAndHumanizeParts('t\ne\rs\r\nt'))
+            .toEqual([[HtmlTokenType.TEXT, 't\ne\ns\nt'], [HtmlTokenType.EOF]]);
+      });
+
       it('should parse entities', () => {
         expect(tokenizeAndHumanizeParts('a&amp;b'))
             .toEqual([[HtmlTokenType.TEXT, 'a&b'], [HtmlTokenType.EOF]]);
@@ -424,11 +452,11 @@ export function main() {
 
     describe('raw text', () => {
       it('should parse text', () => {
-        expect(tokenizeAndHumanizeParts(`<script>a</script>`))
+        expect(tokenizeAndHumanizeParts(`<script>t\ne\rs\r\nt</script>`))
             .toEqual([
               [HtmlTokenType.TAG_OPEN_START, null, 'script'],
               [HtmlTokenType.TAG_OPEN_END],
-              [HtmlTokenType.RAW_TEXT, 'a'],
+              [HtmlTokenType.RAW_TEXT, 't\ne\ns\nt'],
               [HtmlTokenType.TAG_CLOSE, null, 'script'],
               [HtmlTokenType.EOF]
             ]);
@@ -482,11 +510,11 @@ export function main() {
 
     describe('escapable raw text', () => {
       it('should parse text', () => {
-        expect(tokenizeAndHumanizeParts(`<title>a</title>`))
+        expect(tokenizeAndHumanizeParts(`<title>t\ne\rs\r\nt</title>`))
             .toEqual([
               [HtmlTokenType.TAG_OPEN_START, null, 'title'],
               [HtmlTokenType.TAG_OPEN_END],
-              [HtmlTokenType.ESCAPABLE_RAW_TEXT, 'a'],
+              [HtmlTokenType.ESCAPABLE_RAW_TEXT, 't\ne\ns\nt'],
               [HtmlTokenType.TAG_CLOSE, null, 'title'],
               [HtmlTokenType.EOF]
             ]);

Original file line number	Diff line number	Diff line change
`@@ -83,6 +83,9 @@ const $x = 120;`
`83`	`83`
`84`	`84`	`const $NBSP = 160;`
`85`	`85`
	`86`	`+var CRLF_REGEXP = /\r\n/g;`
	`87`	`+var CR_REGEXP = /\r/g;`
	`88`	`+`
`86`	`89`	`function unexpectedCharacterErrorMsg(charCode: number): string {`
`87`	`90`	`var char = charCode === $EOF ? 'EOF' : StringWrapper.fromCharCode(charCode);`
`88`	`91`	return `Unexpected character "${char}"`;
`@@ -119,6 +122,14 @@ class _HtmlTokenizer {`
`119`	`122`	`this._advance();`
`120`	`123`	`}`
`121`	`124`
	`125`	`+ private _processCarriageReturns(content: string): string {`
	`126`	`+ // http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream`
	`127`	`+ // In order to keep the original position in the source, we cannot pre-process it.`
	`128`	`+ // Instead, CRs are processed right before instantiating the tokens.`
	`129`	`+ content = StringWrapper.replaceAll(content, CRLF_REGEXP, '\r');`
	`130`	`+ return StringWrapper.replaceAll(content, CR_REGEXP, '\n');`
	`131`	`+ }`
	`132`	`+`
`122`	`133`	`tokenize(): HtmlTokenizeResult {`
`123`	`134`	`while (this.peek !== $EOF) {`
`124`	`135`	`var start = this._getLocation();`
`@@ -315,7 +326,7 @@ class _HtmlTokenizer {`
`315`	`326`	`parts.push(this._readChar(decodeEntities));`
`316`	`327`	`}`
`317`	`328`	`}`
`318`		`- return this._endToken([parts.join('')], tagCloseStart);`
	`329`	`+ return this._endToken([this._processCarriageReturns(parts.join(''))], tagCloseStart);`
`319`	`330`	`}`
`320`	`331`
`321`	`332`	`private _consumeComment(start: ParseLocation) {`
`@@ -428,7 +439,7 @@ class _HtmlTokenizer {`
`428`	`439`	`this._requireUntilFn(isNameEnd, 1);`
`429`	`440`	`value = this.input.substring(valueStart, this.index);`
`430`	`441`	`}`
`431`		`- this._endToken([value]);`
	`442`	`+ this._endToken([this._processCarriageReturns(value)]);`
`432`	`443`	`}`
`433`	`444`
`434`	`445`	`private _consumeTagOpenEnd() {`
`@@ -456,7 +467,7 @@ class _HtmlTokenizer {`
`456`	`467`	`while (!isTextEnd(this.peek)) {`
`457`	`468`	`parts.push(this._readChar(true));`
`458`	`469`	`}`
`459`		`- this._endToken([parts.join('')]);`
	`470`	`+ this._endToken([this._processCarriageReturns(parts.join(''))]);`
`460`	`471`	`}`
`461`	`472`
`462`	`473`	`private _savePosition(): number[] { return [this.peek, this.index, this.column, this.line]; }`