Skip to content

Replace ASCII control chars with Unicode Control Pictures #127528

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 25, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Replace ASCII control chars with Unicode Control Pictures
```
error: bare CR not allowed in doc-comment
  --> $DIR/lex-bare-cr-string-literal-doc-comment.rs:3:32
   |
LL | /// doc comment with bare CR: '␍'
   |                                ^
```
  • Loading branch information
estebank committed Jul 18, 2024
commit 89f273f40dafb693139496ed6f914872b6533fa6
69 changes: 54 additions & 15 deletions compiler/rustc_errors/src/emitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -677,10 +677,7 @@ impl HumanEmitter {
.skip(left)
.take_while(|ch| {
// Make sure that the trimming on the right will fall within the terminal width.
// FIXME: `unicode_width` sometimes disagrees with terminals on how wide a `char`
// is. For now, just accept that sometimes the code line will be longer than
// desired.
let next = unicode_width::UnicodeWidthChar::width(*ch).unwrap_or(1);
let next = char_width(*ch);
if taken + next > right - left {
return false;
}
Expand Down Expand Up @@ -742,11 +739,7 @@ impl HumanEmitter {
let left = margin.left(source_string.len());

// Account for unicode characters of width !=0 that were removed.
let left = source_string
.chars()
.take(left)
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
.sum();
let left = source_string.chars().take(left).map(|ch| char_width(ch)).sum();

self.draw_line(
buffer,
Expand Down Expand Up @@ -2039,7 +2032,7 @@ impl HumanEmitter {
let sub_len: usize =
if is_whitespace_addition { &part.snippet } else { part.snippet.trim() }
.chars()
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
.map(|ch| char_width(ch))
.sum();

let offset: isize = offsets
Expand Down Expand Up @@ -2076,11 +2069,8 @@ impl HumanEmitter {
}

// length of the code after substitution
let full_sub_len = part
.snippet
.chars()
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
.sum::<usize>() as isize;
let full_sub_len =
part.snippet.chars().map(|ch| char_width(ch)).sum::<usize>() as isize;

// length of the code to be substituted
let snippet_len = span_end_pos as isize - span_start_pos as isize;
Expand Down Expand Up @@ -2580,6 +2570,40 @@ const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
('\u{2068}', ""),
('\u{202C}', ""),
('\u{2069}', ""),
// In terminals without Unicode support the following replacements will be garbled, but the
// underlying control codepoints are garbled in *all* terminals anyway. We could put this
// replacement behind a "unicode support" check.
('\u{0000}', "␀"),
('\u{0001}', "␁"),
('\u{0002}', "␂"),
('\u{0003}', "␃"),
('\u{0004}', "␄"),
('\u{0005}', "␅"),
('\u{0006}', "␆"),
('\u{0007}', "␇"),
('\u{0008}', "␈"),
('\u{000B}', "␋"),
('\u{000C}', "␌"),
('\u{000D}', "␍"),
('\u{000E}', "␎"),
('\u{000F}', "␏"),
('\u{0010}', "␐"),
('\u{0011}', "␑"),
('\u{0012}', "␒"),
('\u{0013}', "␓"),
('\u{0014}', "␔"),
('\u{0015}', "␕"),
('\u{0016}', "␖"),
('\u{0017}', "␗"),
('\u{0018}', "␘"),
('\u{0019}', "␙"),
('\u{001A}', "␚"),
('\u{001B}', "␛"),
('\u{001C}', "␜"),
('\u{001D}', "␝"),
('\u{001E}', "␞"),
('\u{001F}', "␟"),
('\u{007F}', "␡"),
];

fn normalize_whitespace(str: &str) -> String {
Expand All @@ -2590,6 +2614,21 @@ fn normalize_whitespace(str: &str) -> String {
s
}

/// Returns the number of columns `ch` is counted as when laying out a source line.
///
/// A tab counts as 4 columns. The ASCII control characters that `OUTPUT_REPLACEMENTS` maps to
/// a single replacement character (U+0000..=U+0008, U+000B..=U+001F and U+007F) count as 1
/// column. Every other character uses the width reported by `unicode_width`, falling back to 1
/// when no width is reported.
fn char_width(ch: char) -> usize {
    // Tabs are handled first; they are the only character given a fixed width other than 1.
    if ch == '\t' {
        return 4;
    }

    // The control characters that get a one-character replacement in `OUTPUT_REPLACEMENTS`.
    // U+0009 (tab) and U+000A (line feed) are deliberately outside these ranges.
    let is_replaced_control =
        matches!(ch, '\u{0000}'..='\u{0008}' | '\u{000B}'..='\u{001F}' | '\u{007F}');

    if is_replaced_control {
        1
    } else {
        // FIXME: `unicode_width` sometimes disagrees with terminals on how wide a `char` is.
        // For now, just accept that sometimes the code line will be longer than desired.
        unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1)
    }
}

/// Writes the `"| "` column separator into `buffer` at position (`line`, `col`), styled as
/// `Style::LineNumber`.
fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) {
    let separator = "| ";
    buffer.puts(line, col, separator, Style::LineNumber);
}
Expand Down
14 changes: 7 additions & 7 deletions tests/ui/lexer/lex-bare-cr-string-literal-doc-comment.stderr
Original file line number Diff line number Diff line change
@@ -1,31 +1,31 @@
error: bare CR not allowed in doc-comment
--> $DIR/lex-bare-cr-string-literal-doc-comment.rs:3:32
|
LL | /// doc comment with bare CR: ''
LL | /// doc comment with bare CR: ''
| ^

error: bare CR not allowed in block doc-comment
--> $DIR/lex-bare-cr-string-literal-doc-comment.rs:7:38
|
LL | /** block doc comment with bare CR: '' */
LL | /** block doc comment with bare CR: '' */
| ^

error: bare CR not allowed in doc-comment
--> $DIR/lex-bare-cr-string-literal-doc-comment.rs:12:36
|
LL | //! doc comment with bare CR: ''
LL | //! doc comment with bare CR: ''
| ^

error: bare CR not allowed in block doc-comment
--> $DIR/lex-bare-cr-string-literal-doc-comment.rs:15:42
|
LL | /*! block doc comment with bare CR: '' */
LL | /*! block doc comment with bare CR: '' */
| ^

error: bare CR not allowed in string, use `\r` instead
--> $DIR/lex-bare-cr-string-literal-doc-comment.rs:19:18
|
LL | let _s = "foobar";
LL | let _s = "foobar";
| ^
|
help: escape the character
Expand All @@ -36,13 +36,13 @@ LL | let _s = "foo\rbar";
error: bare CR not allowed in raw string
--> $DIR/lex-bare-cr-string-literal-doc-comment.rs:22:19
|
LL | let _s = r"barfoo";
LL | let _s = r"barfoo";
| ^

error: unknown character escape: `\r`
--> $DIR/lex-bare-cr-string-literal-doc-comment.rs:25:19
|
LL | let _s = "foo\bar";
LL | let _s = "foo\bar";
| ^ unknown character escape
|
= help: this is an isolated carriage return; consider checking your editor and version control settings
Expand Down
Binary file modified tests/ui/parser/bad-char-literals.rs
Binary file not shown.
17 changes: 14 additions & 3 deletions tests/ui/parser/bad-char-literals.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,27 @@ LL | '\n';
error: character constant must be escaped: `\r`
--> $DIR/bad-char-literals.rs:15:6
|
LL | '';
LL | '';
| ^
|
help: escape the character
|
LL | '\r';
| ++

error: character literal may only contain one codepoint
--> $DIR/bad-char-literals.rs:18:5
|
LL | '-␀-';
| ^^^^
|
help: if you meant to write a string literal, use double quotes
|
LL | "-␀-";
| ~ ~

error: character constant must be escaped: `\t`
--> $DIR/bad-char-literals.rs:18:6
--> $DIR/bad-char-literals.rs:21:6
|
LL | ' ';
| ^^^^
Expand All @@ -44,5 +55,5 @@ help: escape the character
LL | '\t';
| ++

error: aborting due to 4 previous errors
error: aborting due to 5 previous errors

Binary file modified tests/ui/parser/issues/issue-66473.stderr
Binary file not shown.
Binary file modified tests/ui/parser/issues/issue-68629.stderr
Binary file not shown.
Binary file modified tests/ui/parser/issues/issue-68730.stderr
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/ui/parser/raw/raw-byte-string-literals.stderr
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
error: bare CR not allowed in raw string
--> $DIR/raw-byte-string-literals.rs:4:9
|
LL | br"a";
LL | br"a";
| ^

error: non-ASCII character in raw byte string literal
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
error: bare CR not allowed in doc-comment
--> $DIR/several-carriage-returns-in-doc-comment.rs:6:12
|
LL | /// This doc comment contains three isolated `\r` symbols
LL | /// This doc comment contains three isolated `\r` symbols
| ^

error: bare CR not allowed in doc-comment
--> $DIR/several-carriage-returns-in-doc-comment.rs:6:32
|
LL | /// This doc comment contains three isolated `\r` symbols
LL | /// This doc comment contains three isolated `\r` symbols
| ^

error: bare CR not allowed in doc-comment
--> $DIR/several-carriage-returns-in-doc-comment.rs:6:52
|
LL | /// This doc comment contains three isolated `\r` symbols
LL | /// This doc comment contains three isolated `\r` symbols
| ^

error: aborting due to 3 previous errors
Expand Down
2 changes: 1 addition & 1 deletion tests/ui/parser/trailing-carriage-return-in-string.stderr
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
error: unknown character escape: `\r`
--> $DIR/trailing-carriage-return-in-string.rs:10:25
|
LL | let bad = "This is \ a test";
LL | let bad = "This is \ a test";
| ^ unknown character escape
Comment on lines 1 to 5
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is confusing, because the problem is that the `\` escape is followed by a non-printable character. It is parsed as `\␍`, which is kind of `\\r`.

|
= help: this is an isolated carriage return; consider checking your editor and version control settings
Expand Down
Binary file modified tests/ui/parser/utf16-be-without-bom.stderr
Binary file not shown.
Binary file modified tests/ui/parser/utf16-le-without-bom.stderr
Binary file not shown.
Binary file modified tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.stderr
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/ui/str/str-escape.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ warning: whitespace symbol '\u{c}' is not skipped
|
LL | let s = b"a\
| ________________^
LL | | b";
LL | | b";
| | ^- whitespace symbol '\u{c}' is not skipped
| |____|
|
Expand Down