Skip to content

Commit 24a3c01

Browse files
Use appropriate charset in body_string() (http-rs#108)
* Sketch encoding-aware body_string() implementation
* Add some tests for the encodings
* Fix ownership of byte array in utf-8-only path
* Make wasm32 impl of decode_body compile
* Error if "encoding" feature is disabled and body is not utf-8
* Custom Debug impl for DecodeError to truncate the `data` bytes
* Document encoding behaviour of body_string()
1 parent 2540aea commit 24a3c01

File tree

3 files changed

+206
-5
lines changed

3 files changed

+206
-5
lines changed

Cargo.toml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,13 @@ readme = "README.md"
1212
edition = "2018"
1313

1414
[features]
15-
default = ["native-client", "middleware-logger"]
15+
default = ["native-client", "middleware-logger", "encoding"]
1616
native-client = ["curl-client", "wasm-client"]
1717
hyper-client = ["hyper", "runtime", "runtime-raw", "runtime-tokio" ]
1818
curl-client = ["isahc"]
1919
wasm-client = ["js-sys", "web-sys", "wasm-bindgen", "wasm-bindgen-futures"]
2020
middleware-logger = []
21+
encoding = ["encoding_rs"]
2122

2223
[dependencies]
2324
futures-preview = { version = "0.3.0-alpha.19", features = ["compat", "io-compat"] }
@@ -30,8 +31,11 @@ serde_json = "1.0.40"
3031
serde_urlencoded = "0.6.1"
3132
url = "2.0.0"
3233

33-
# isahc-client
3434
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
35+
# encoding
36+
encoding_rs = { version = "0.8.20", optional = true }
37+
38+
# isahc-client
3539
isahc = { version = "0.7", optional = true, default-features = false, features = ["http2"] }
3640

3741
# hyper-client
@@ -63,6 +67,7 @@ features = [
6367
"RequestMode",
6468
"RequestRedirect",
6569
"Response",
70+
"TextDecoder",
6671
"Window",
6772
]
6873

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ pub use url;
8888

8989
pub use client::Client;
9090
pub use request::Request;
91-
pub use response::Response;
91+
pub use response::{DecodeError, Response};
9292

9393
#[cfg(feature = "native-client")]
9494
mod one_off;

src/response.rs

Lines changed: 198 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,12 +144,21 @@ impl Response {
144144
/// This method can be called after the body has already been read, but will
145145
/// produce an empty buffer.
146146
///
147+
/// # Encodings
148+
///
149+
/// If the "encoding" feature is enabled, this method tries to decode the body
150+
/// with the encoding that is specified in the Content-Type header. If the header
151+
/// does not specify an encoding, UTF-8 is assumed. If the "encoding" feature is
152+
/// disabled, Surf only supports reading UTF-8 response bodies. The "encoding"
153+
/// feature is enabled by default.
154+
///
147155
/// # Errors
148156
///
149157
/// Any I/O error encountered while reading the body is immediately returned
150158
/// as an `Err`.
151159
///
152-
/// If the body cannot be interpreted as valid UTF-8, an `Err` is returned.
160+
/// If the body cannot be interpreted because the encoding is unsupported or
161+
/// incorrect, an `Err` is returned.
153162
///
154163
/// # Examples
155164
///
@@ -162,7 +171,12 @@ impl Response {
162171
/// ```
163172
pub async fn body_string(&mut self) -> Result<String, Exception> {
164173
let bytes = self.body_bytes().await?;
165-
Ok(String::from_utf8(bytes).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?)
174+
let mime = self.mime();
175+
let claimed_encoding = mime
176+
.as_ref()
177+
.and_then(|mime| mime.get_param("charset"))
178+
.map(|name| name.as_str());
179+
decode_body(bytes, claimed_encoding)
166180
}
167181

168182
/// Reads and deserializes the entire response body from JSON.
@@ -246,3 +260,185 @@ impl fmt::Debug for Response {
246260
.finish()
247261
}
248262
}
263+
264+
/// An error that occurred while decoding a response body to a string.
///
/// The error carries the encoding that was used to attempt to decode the body, and the raw byte
/// contents of the body. This can be used to treat un-decodable bodies specially or to implement a
/// fallback parsing strategy.
#[derive(Clone)]
pub struct DecodeError {
    /// The name of the encoding that was used to try to decode the input.
    pub encoding: String,
    /// The input data as bytes.
    pub data: Vec<u8>,
}

// Override debug output so you don't get each individual byte in `data` printed out separately,
// because it can be many megabytes large. The actual content is not that interesting anyways
// and can be accessed manually if it is required.
impl fmt::Debug for DecodeError {
    // Trait method implementations do not need their own doc examples.
    #[allow(missing_doc_code_examples)]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("DecodeError")
            .field("encoding", &self.encoding)
            // Only the size of the payload is shown. `format_args!` avoids allocating a
            // temporary `String` and keeps the summary from being rendered as a quoted string.
            // Perhaps we can output the first N bytes of the response in the future.
            .field("data", &format_args!("{} bytes", self.data.len()))
            .finish()
    }
}

impl fmt::Display for DecodeError {
    // Trait method implementations do not need their own doc examples.
    #[allow(missing_doc_code_examples)]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "could not decode body as {}", self.encoding)
    }
}

impl std::error::Error for DecodeError {}
299+
300+
/// Check if an encoding label refers to the UTF-8 encoding.
///
/// Labels are matched the way the WHATWG Encoding Standard describes: leading and trailing ASCII
/// whitespace is ignored and the comparison is ASCII case-insensitive. Every label the standard
/// assigns to UTF-8 is accepted, not only the most common spellings.
///
/// Returns `true` if `encoding_label` names UTF-8, `false` otherwise (including for the empty
/// string).
// Only the UTF-8-only and wasm32 implementations of `decode_body` call this helper, so it is
// unused in native builds that have the "encoding" feature enabled.
#[allow(dead_code)]
fn is_utf8_encoding(encoding_label: &str) -> bool {
    /// Labels that map to UTF-8 in the WHATWG Encoding Standard.
    const UTF8_LABELS: [&str; 6] = [
        "unicode-1-1-utf-8",
        "unicode11utf8",
        "unicode20utf8",
        "utf-8",
        "utf8",
        "x-unicode20utf8",
    ];
    /// ASCII whitespace as defined by the WHATWG Infra Standard (TAB, LF, FF, CR, SPACE).
    const ASCII_WHITESPACE: [char; 5] = ['\t', '\n', '\x0C', '\r', ' '];

    let label = encoding_label.trim_matches(&ASCII_WHITESPACE[..]);
    UTF8_LABELS
        .iter()
        .any(|known| label.eq_ignore_ascii_case(known))
}
307+
308+
/// Decode a response body as utf-8.
309+
///
310+
/// # Errors
311+
///
312+
/// If the body cannot be decoded as utf-8, this function returns an `std::io::Error` of kind
313+
/// `std::io::ErrorKind::InvalidData`, carrying a `DecodeError` struct.
314+
#[cfg(not(feature = "encoding"))]
315+
fn decode_body(bytes: Vec<u8>, content_encoding: Option<&str>) -> Result<String, Exception> {
316+
if is_utf8_encoding(content_encoding.unwrap_or("utf-8")) {
317+
Ok(String::from_utf8(bytes).map_err(|err| {
318+
let err = DecodeError {
319+
encoding: "utf-8".to_string(),
320+
data: err.into_bytes(),
321+
};
322+
io::Error::new(io::ErrorKind::InvalidData, err)
323+
})?)
324+
} else {
325+
let err = DecodeError {
326+
encoding: "utf-8".to_string(),
327+
data: bytes,
328+
};
329+
Err(io::Error::new(io::ErrorKind::InvalidData, err).into())
330+
}
331+
}
332+
333+
/// Decode a response body as the given content type.
///
/// `content_encoding` is the charset label claimed for the body; `None` is treated as utf-8.
/// The label is resolved with `encoding_rs::Encoding::for_label`.
///
/// If the decoder hands back a borrow of the entire input (the bytes are already valid UTF-8 and
/// nothing was stripped from them), the input buffer is reused and no copy is made.
///
/// # Errors
///
/// If an unsupported encoding is requested, or the body does not conform to the requested
/// encoding, this function returns an `std::io::Error` of kind `std::io::ErrorKind::InvalidData`,
/// carrying a `DecodeError` struct.
#[cfg(all(feature = "encoding", not(target_arch = "wasm32")))]
fn decode_body(bytes: Vec<u8>, content_encoding: Option<&str>) -> Result<String, Exception> {
    use encoding_rs::Encoding;
    use std::borrow::Cow;

    let content_encoding = content_encoding.unwrap_or("utf-8");
    let encoding = match Encoding::for_label(content_encoding.as_bytes()) {
        Some(encoding) => encoding,
        None => {
            let err = DecodeError {
                encoding: content_encoding.to_string(),
                data: bytes,
            };
            return Err(io::Error::new(io::ErrorKind::InvalidData, err).into());
        }
    };

    // `decode` performs BOM sniffing, so the encoding actually used may differ from the one
    // that was requested; report the one that was used.
    let (decoded, encoding_used, failed) = encoding.decode(&bytes);
    if failed {
        let err = DecodeError {
            encoding: encoding_used.name().into(),
            data: bytes,
        };
        return Err(io::Error::new(io::ErrorKind::InvalidData, err).into());
    }

    match decoded {
        Cow::Owned(string) => Ok(string),
        Cow::Borrowed(text) => {
            if text.as_ptr() == bytes.as_ptr() && text.len() == bytes.len() {
                // SAFETY: `text` is a `&str`, hence valid UTF-8, and it starts at the same
                // address and has the same length as `bytes`, so `bytes` holds exactly those
                // UTF-8 bytes.
                Ok(unsafe { String::from_utf8_unchecked(bytes) })
            } else {
                // The decoder borrowed only part of the input (e.g. it skipped a byte order
                // mark); reusing the whole buffer would put the skipped bytes back.
                Ok(text.to_owned())
            }
        }
    }
}
373+
374+
/// Decode a response body as the given content type.
///
/// `content_encoding` is the charset label claimed for the body; `None` is treated as utf-8.
/// A utf-8 body is converted in place, reusing the input buffer. Any other encoding is handed to
/// the browser's `TextDecoder`, which produces a new string.
///
/// # Errors
///
/// If an unsupported encoding is requested, or the body does not conform to the requested
/// encoding, this function returns an `std::io::Error` of kind `std::io::ErrorKind::InvalidData`,
/// carrying a `DecodeError` struct.
#[cfg(all(feature = "encoding", target_arch = "wasm32"))]
fn decode_body(mut bytes: Vec<u8>, content_encoding: Option<&str>) -> Result<String, Exception> {
    use web_sys::TextDecoder;

    // Encoding names are always valid ASCII, so we can avoid including casing mapping tables
    let content_encoding = content_encoding.unwrap_or("utf-8").to_ascii_lowercase();
    if is_utf8_encoding(&content_encoding) {
        return String::from_utf8(bytes).map_err(|err| {
            let err = DecodeError {
                encoding: "utf-8".to_string(),
                // `FromUtf8Error` hands the original buffer back without copying.
                data: err.into_bytes(),
            };
            io::Error::new(io::ErrorKind::InvalidData, err).into()
        });
    }

    // Constructing the decoder fails for labels the browser does not know; treat that the same
    // as a failed decode instead of panicking.
    // NOTE(review): a `TextDecoder` created without the `fatal` option substitutes U+FFFD for
    // malformed input rather than failing — confirm whether malformed bodies should be rejected.
    let decoded = match TextDecoder::new_with_label(&content_encoding) {
        Ok(decoder) => decoder.decode_with_u8_array(&mut bytes).ok(),
        Err(_) => None,
    };

    match decoded {
        Some(text) => Ok(text),
        None => {
            let err = DecodeError {
                encoding: content_encoding,
                data: bytes,
            };
            Err(io::Error::new(io::ErrorKind::InvalidData, err).into())
        }
    }
}
404+
405+
/// Tests for `decode_body`, covering both successful decodes and the error paths.
#[cfg(test)]
mod decode_tests {
    use super::decode_body;

    #[test]
    fn utf8() {
        let input = "Rød grød med fløde";
        assert_eq!(
            decode_body(input.as_bytes().to_vec(), Some("utf-8")).unwrap(),
            input,
            "Parses utf-8"
        );
    }

    #[test]
    fn default_utf8() {
        let input = "Rød grød med fløde";
        assert_eq!(
            decode_body(input.as_bytes().to_vec(), None).unwrap(),
            input,
            "Defaults to utf-8"
        );
    }

    #[test]
    fn euc_kr() {
        let input = vec![
            0xb3, 0xbb, 0x20, 0xc7, 0xb0, 0xc0, 0xb8, 0xb7, 0xce, 0x20, 0xb5, 0xb9, 0xbe, 0xc6,
            0xbf, 0xc0, 0xb6, 0xf3, 0x2c, 0x20, 0xb3, 0xbb, 0x20, 0xbe, 0xc8, 0xbf, 0xa1, 0xbc,
            0xad, 0x20, 0xc0, 0xe1, 0xb5, 0xe9, 0xb0, 0xc5, 0xb6, 0xf3,
        ];

        let result = decode_body(input, Some("euc-kr"));
        // Without the "encoding" feature only utf-8 bodies can be decoded.
        if cfg!(feature = "encoding") {
            assert_eq!(result.unwrap(), "내 품으로 돌아오라, 내 안에서 잠들거라");
        } else {
            assert!(result.is_err(), "Only utf-8 is supported");
        }
    }

    #[test]
    fn invalid_utf8() {
        // 0xc3 opens a two-byte sequence, but 0x28 is not a continuation byte.
        let result = decode_body(vec![0xc3, 0x28], Some("utf-8"));
        assert!(result.is_err(), "Rejects malformed utf-8");
    }

    #[test]
    fn unknown_encoding() {
        let result = decode_body(b"hello".to_vec(), Some("not-a-real-encoding"));
        assert!(result.is_err(), "Rejects unknown encoding labels");
    }
}

0 commit comments

Comments
 (0)