Skip to content

Commit c89f87f

Browse files
Detect malformed UTF-8 files and refuse to engage further (#53667)
1 parent d23b7e7 commit c89f87f

File tree

11 files changed

+817
-775
lines changed

11 files changed

+817
-775
lines changed

src/compiler/diagnosticMessages.json

+4
Original file line number | Diff line number | Diff line change
@@ -1605,6 +1605,10 @@
16051605
"category": "Error",
16061606
"code": 1489
16071607
},
1608+
"File appears to be binary.": {
1609+
"category": "Error",
1610+
"code": 1490
1611+
},
16081612

16091613
"The types of '{0}' are incompatible between these types.": {
16101614
"category": "Error",

src/compiler/program.ts

+1 -1
Original file line number | Diff line number | Diff line change
@@ -779,7 +779,7 @@ export function formatDiagnosticsWithColorAndContext(diagnostics: readonly Diagn
779779
output += formatColorAndReset(` TS${diagnostic.code}: `, ForegroundColorEscapeSequences.Grey);
780780
output += flattenDiagnosticMessageText(diagnostic.messageText, host.getNewLine());
781781

782-
if (diagnostic.file) {
782+
if (diagnostic.file && diagnostic.code !== Diagnostics.File_appears_to_be_binary.code) {
783783
output += host.getNewLine();
784784
output += formatCodeSpan(diagnostic.file, diagnostic.start!, diagnostic.length!, "", getCategoryFormat(diagnostic.category), host); // TODO: GH#18217
785785
}

src/compiler/scanner.ts

+20 -8
Original file line number | Diff line number | Diff line change
@@ -1782,16 +1782,28 @@ export function createScanner(languageVersion: ScriptTarget,
17821782
if (pos >= end) {
17831783
return token = SyntaxKind.EndOfFileToken;
17841784
}
1785-
const ch = codePointAt(text, pos);
17861785

1787-
// Special handling for shebang
1788-
if (ch === CharacterCodes.hash && pos === 0 && isShebangTrivia(text, pos)) {
1789-
pos = scanShebangTrivia(text, pos);
1790-
if (skipTrivia) {
1791-
continue;
1786+
const ch = codePointAt(text, pos);
1787+
if (pos === 0) {
1788+
// If a file wasn't valid text at all, it will usually be apparent at
1789+
// position 0 because UTF-8 decode will fail and produce U+FFFD.
1790+
// If that happens, just issue one error and refuse to try to scan further;
1791+
// this is likely a binary file that cannot be parsed
1792+
if (ch === CharacterCodes.replacementCharacter) {
1793+
// Jump to the end of the file and fail.
1794+
error(Diagnostics.File_appears_to_be_binary);
1795+
pos = end;
1796+
return token = SyntaxKind.NonTextFileMarkerTrivia;
17921797
}
1793-
else {
1794-
return token = SyntaxKind.ShebangTrivia;
1798+
// Special handling for shebang
1799+
if (ch === CharacterCodes.hash && isShebangTrivia(text, pos)) {
1800+
pos = scanShebangTrivia(text, pos);
1801+
if (skipTrivia) {
1802+
continue;
1803+
}
1804+
else {
1805+
return token = SyntaxKind.ShebangTrivia;
1806+
}
17951807
}
17961808
}
17971809

src/compiler/types.ts

+6
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,9 @@ export const enum SyntaxKind {
4949
// We detect and provide better error recovery when we encounter a git merge marker. This
5050
// allows us to edit files with git-conflict markers in them in a much more pleasant manner.
5151
ConflictMarkerTrivia,
52+
// If a file is actually binary, with any luck, we'll get U+FFFD REPLACEMENT CHARACTER
53+
// in position zero and can just skip what is surely a doomed parse.
54+
NonTextFileMarkerTrivia,
5255
// Literals
5356
NumericLiteral,
5457
BigIntLiteral,
@@ -7469,6 +7472,9 @@ export const enum CharacterCodes {
74697472
mathematicalSpace = 0x205F,
74707473
ogham = 0x1680,
74717474

7475+
// Unicode replacement character produced when a byte sequence is invalid
7476+
replacementCharacter = 0xFFFD,
7477+
74727478
_ = 0x5F,
74737479
$ = 0x24,
74747480

0 commit comments

Comments
 (0)