Use appropriate charset in body_string() (http-rs#108)

goto-bus-stop · web-flow · commit 24a3c01c4a89 · 2019-11-22T14:37:15.000+01:00
* Sketch encoding-aware body_string() implementation

* Add some tests for the encodings

* Fix ownership of byte array in utf-8-only path

* make wasm32 impl of decode_body compile

* Error if "encoding" feature is disabled and body is not utf-8

* custom Debug impl for DecodeError to truncate the `data` bytes

* document encoding behaviour of body_string()
diff --git a/Cargo.toml b/Cargo.toml
@@ -12,12 +12,13 @@ readme = "README.md"
 edition = "2018"
 
 [features]
-default = ["native-client", "middleware-logger"]
+default = ["native-client", "middleware-logger", "encoding"]
 native-client = ["curl-client", "wasm-client"]
 hyper-client = ["hyper", "runtime", "runtime-raw", "runtime-tokio" ]
 curl-client = ["isahc"]
 wasm-client = ["js-sys", "web-sys", "wasm-bindgen", "wasm-bindgen-futures"]
 middleware-logger = []
+encoding = ["encoding_rs"]
 
 [dependencies]
 futures-preview = { version = "0.3.0-alpha.19", features = ["compat", "io-compat"] }
@@ -30,8 +31,11 @@ serde_json = "1.0.40"
 serde_urlencoded = "0.6.1"
 url = "2.0.0"
 
-# isahc-client
 [target.'cfg(not(target_arch = "wasm32"))'.dependencies]
+# encoding
+encoding_rs = { version = "0.8.20", optional = true }
+
+# isahc-client
 isahc = { version = "0.7", optional = true, default-features = false, features = ["http2"]  }
 
 # hyper-client
@@ -63,6 +67,7 @@ features = [
     "RequestMode",
     "RequestRedirect",
     "Response",
+    "TextDecoder",
     "Window",
 ]
 
diff --git a/src/lib.rs b/src/lib.rs
@@ -88,7 +88,7 @@ pub use url;
 
 pub use client::Client;
 pub use request::Request;
-pub use response::Response;
+pub use response::{DecodeError, Response};
 
 #[cfg(feature = "native-client")]
 mod one_off;
diff --git a/src/response.rs b/src/response.rs
@@ -144,12 +144,21 @@ impl Response {
     /// This method can be called after the body has already been read, but will
     /// produce an empty buffer.
     ///
+    /// # Encodings
+    ///
+    /// If the "encoding" feature is enabled, this method tries to decode the body
+    /// with the encoding that is specified in the Content-Type header. If the header
+    /// does not specify an encoding, UTF-8 is assumed. If the "encoding" feature is
+    /// disabled, Surf only supports reading UTF-8 response bodies. The "encoding"
+    /// feature is enabled by default.
+    ///
     /// # Errors
     ///
     /// Any I/O error encountered while reading the body is immediately returned
     /// as an `Err`.
     ///
-    /// If the body cannot be interpreted as valid UTF-8, an `Err` is returned.
+    /// If the body cannot be interpreted because the encoding is unsupported or
+    /// incorrect, an `Err` is returned.
     ///
     /// # Examples
     ///
@@ -162,7 +171,12 @@ impl Response {
     /// ```
     pub async fn body_string(&mut self) -> Result<String, Exception> {
         let bytes = self.body_bytes().await?;
-        Ok(String::from_utf8(bytes).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?)
+        let mime = self.mime();
+        let claimed_encoding = mime
+            .as_ref()
+            .and_then(|mime| mime.get_param("charset"))
+            .map(|name| name.as_str());
+        decode_body(bytes, claimed_encoding)
     }
 
     /// Reads and deserialized the entire request body from json.
@@ -246,3 +260,185 @@ impl fmt::Debug for Response {
             .finish()
     }
 }
+
+/// An error occurred while decoding a response body to a string.
+///
+/// The error carries the encoding that was used to attempt to decode the body, and the raw byte
+/// contents of the body. This can be used to treat un-decodable bodies specially or to implement a
+/// fallback parsing strategy.
+///
+/// The `decode_body` functions in this module hand it to callers wrapped in an `std::io::Error`
+/// of kind `std::io::ErrorKind::InvalidData`. Its `Debug` output is written by hand so that
+/// `data` is summarised by its length rather than printed byte by byte.
+#[derive(Clone)]
+pub struct DecodeError {
+    /// The name of the encoding that was used to try to decode the input.
+    pub encoding: String,
+    /// The input data as bytes: the complete, undecoded body, returned to the caller so it is
+    /// not lost when decoding fails.
+    pub data: Vec<u8>,
+}
+
+// The body held in `data` can be many megabytes, so Debug reports only its length instead of
+// printing every byte. The bytes themselves stay available through the public `data` field.
+impl fmt::Debug for DecodeError {
+    #[allow(missing_doc_code_examples)]
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Perhaps the first N bytes of the response could be shown here in the future.
+        let size_summary = format!("{} bytes", self.data.len());
+        let mut builder = f.debug_struct("DecodeError");
+        builder.field("encoding", &self.encoding);
+        builder.field("data", &size_summary);
+        builder.finish()
+    }
+}
+
+impl fmt::Display for DecodeError {
+    #[allow(missing_doc_code_examples)]
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Names only the attempted encoding; the body bytes are never part of the message.
+        f.write_fmt(format_args!("could not decode body as {}", self.encoding))
+    }
+}
+
+// No methods are overridden: `DecodeError` has no underlying source error. Implementing the
+// trait is what allows it to be passed to `io::Error::new` as the error payload.
+impl std::error::Error for DecodeError {}
+
+/// Check if an encoding label refers to the UTF-8 encoding.
+///
+/// The comparison is ASCII case-insensitive and recognises every label that the WHATWG Encoding
+/// Standard lists for UTF-8. Surrounding whitespace is not trimmed, so callers must pass the bare
+/// label.
+// Unused when the native `encoding_rs` implementation of `decode_body` is compiled, because that
+// variant resolves labels through `Encoding::for_label` instead.
+#[allow(dead_code)]
+fn is_utf8_encoding(encoding_label: &str) -> bool {
+    const UTF8_LABELS: [&str; 6] = [
+        "utf-8",
+        "utf8",
+        "unicode-1-1-utf-8",
+        "unicode11utf8",
+        "unicode20utf8",
+        "x-unicode20utf8",
+    ];
+
+    UTF8_LABELS
+        .iter()
+        .any(|label| encoding_label.eq_ignore_ascii_case(label))
+}
+
+/// Decode a response body as utf-8, the only encoding available without the "encoding" feature.
+///
+/// A missing `content_encoding` is treated as utf-8. On success the input buffer is reused for
+/// the returned `String`, so no copy is made.
+///
+/// # Errors
+///
+/// Returns an `std::io::Error` of kind `std::io::ErrorKind::InvalidData`, carrying a
+/// `DecodeError` struct, when:
+///
+/// - the label names utf-8 but the body is not valid utf-8 (`encoding` is `"utf-8"`), or
+/// - the label names any other encoding (`encoding` is the requested label, matching what the
+///   `encoding_rs`-backed implementation reports for labels it does not know).
+#[cfg(not(feature = "encoding"))]
+fn decode_body(bytes: Vec<u8>, content_encoding: Option<&str>) -> Result<String, Exception> {
+    let label = content_encoding.unwrap_or("utf-8");
+
+    let err = if is_utf8_encoding(label) {
+        match String::from_utf8(bytes) {
+            Ok(text) => return Ok(text),
+            // `FromUtf8Error` gives the original buffer back, so the body is not lost.
+            Err(err) => DecodeError {
+                encoding: "utf-8".to_string(),
+                data: err.into_bytes(),
+            },
+        }
+    } else {
+        // Nothing was decoded here: report the encoding the server claimed, not utf-8.
+        DecodeError {
+            encoding: label.to_string(),
+            data: bytes,
+        }
+    };
+
+    Err(io::Error::new(io::ErrorKind::InvalidData, err).into())
+}
+
+/// Decode a response body as the given content type.
+///
+/// A missing `content_encoding` is treated as utf-8. The decoder performs BOM sniffing: a leading
+/// byte order mark selects the encoding and is not part of the returned string.
+///
+/// If the input bytes are already valid utf-8 and do not start with a BOM, the input buffer is
+/// reused and no copy is made.
+///
+/// # Errors
+///
+/// If an unsupported encoding is requested, or the body does not conform to the requested
+/// encoding, this function returns an `std::io::Error` of kind `std::io::ErrorKind::InvalidData`,
+/// carrying a `DecodeError` struct.
+#[cfg(all(feature = "encoding", not(target_arch = "wasm32")))]
+fn decode_body(bytes: Vec<u8>, content_encoding: Option<&str>) -> Result<String, Exception> {
+    use encoding_rs::Encoding;
+    use std::borrow::Cow;
+
+    let content_encoding = content_encoding.unwrap_or("utf-8");
+    let encoding = match Encoding::for_label(content_encoding.as_bytes()) {
+        Some(encoding) => encoding,
+        None => {
+            // Unknown label: report the label that was requested.
+            let err = DecodeError {
+                encoding: content_encoding.to_string(),
+                data: bytes,
+            };
+            return Err(io::Error::new(io::ErrorKind::InvalidData, err).into());
+        }
+    };
+
+    let (decoded, encoding_used, failed) = encoding.decode(&bytes);
+    if failed {
+        // `encoding_used` can differ from the requested encoding when a BOM was sniffed.
+        let err = DecodeError {
+            encoding: encoding_used.name().into(),
+            data: bytes,
+        };
+        return Err(io::Error::new(io::ErrorKind::InvalidData, err).into());
+    }
+
+    // Resolve every case that needs an owned copy first, so that the borrow of `bytes` held by
+    // `decoded` has ended before the buffer itself is moved.
+    let copied = match decoded {
+        Cow::Owned(string) => Some(string),
+        // A shorter borrow means the decoder skipped a BOM; keep only what it returned.
+        Cow::Borrowed(text) if text.len() != bytes.len() => Some(text.to_owned()),
+        Cow::Borrowed(_) => None,
+    };
+
+    Ok(match copied {
+        Some(string) => string,
+        // SAFETY: this arm is reached only when the decoder returned a borrowed `str` exactly as
+        // long as `bytes`. That borrow is a subslice of `bytes`, so it covers the whole buffer,
+        // which the decoder has therefore already validated as UTF-8.
+        None => unsafe { String::from_utf8_unchecked(bytes) },
+    })
+}
+
+/// Decode a response body as the given content type, using the browser's `TextDecoder`.
+///
+/// A missing `content_encoding` is treated as utf-8. Bodies labelled as utf-8 are validated in
+/// Rust and reuse the input buffer; every other encoding is handed to `TextDecoder`, which
+/// returns a new string.
+///
+/// # Errors
+///
+/// If an unsupported encoding is requested, or the body does not conform to the requested
+/// encoding, this function returns an `std::io::Error` of kind `std::io::ErrorKind::InvalidData`,
+/// carrying a `DecodeError` struct.
+#[cfg(all(feature = "encoding", target_arch = "wasm32"))]
+fn decode_body(mut bytes: Vec<u8>, content_encoding: Option<&str>) -> Result<String, Exception> {
+    use web_sys::TextDecoder;
+
+    // Encoding names are always valid ASCII, so we can avoid including casing mapping tables
+    let content_encoding = content_encoding.unwrap_or("utf-8").to_ascii_lowercase();
+    if is_utf8_encoding(&content_encoding) {
+        return String::from_utf8(bytes).map_err(|err| {
+            // `FromUtf8Error` gives the original buffer back, so the body is not lost.
+            let err = DecodeError {
+                encoding: "utf-8".to_string(),
+                data: err.into_bytes(),
+            };
+            io::Error::new(io::ErrorKind::InvalidData, err).into()
+        });
+    }
+
+    // The constructor fails for labels the browser does not know; report that as an error
+    // instead of panicking.
+    let decoder = match TextDecoder::new_with_label(&content_encoding) {
+        Ok(decoder) => decoder,
+        Err(_) => {
+            let err = DecodeError {
+                encoding: content_encoding,
+                data: bytes,
+            };
+            return Err(io::Error::new(io::ErrorKind::InvalidData, err).into());
+        }
+    };
+
+    // NOTE(review): the decoder is created without the `fatal` option, so it presumably replaces
+    // malformed input with U+FFFD rather than failing - confirm whether this error branch can be
+    // reached for malformed bodies.
+    match decoder.decode_with_u8_array(&mut bytes) {
+        Ok(text) => Ok(text),
+        Err(_) => {
+            let err = DecodeError {
+                encoding: content_encoding,
+                data: bytes,
+            };
+            Err(io::Error::new(io::ErrorKind::InvalidData, err).into())
+        }
+    }
+}
+
+// Unit tests for `decode_body`. They run against whichever implementation the enabled features
+// select, so expectations that depend on the "encoding" feature branch on `cfg!`.
+#[cfg(test)]
+mod decode_tests {
+    use super::decode_body;
+
+    #[test]
+    fn utf8() {
+        let input = "Rød grød med fløde";
+        assert_eq!(
+            decode_body(input.as_bytes().to_vec(), Some("utf-8")).unwrap(),
+            input,
+            "Parses utf-8"
+        );
+    }
+
+    #[test]
+    fn default_utf8() {
+        let input = "Rød grød med fløde";
+        assert_eq!(
+            decode_body(input.as_bytes().to_vec(), None).unwrap(),
+            input,
+            "Defaults to utf-8"
+        );
+    }
+
+    #[test]
+    fn euc_kr() {
+        let input = vec![
+            0xb3, 0xbb, 0x20, 0xc7, 0xb0, 0xc0, 0xb8, 0xb7, 0xce, 0x20, 0xb5, 0xb9, 0xbe, 0xc6,
+            0xbf, 0xc0, 0xb6, 0xf3, 0x2c, 0x20, 0xb3, 0xbb, 0x20, 0xbe, 0xc8, 0xbf, 0xa1, 0xbc,
+            0xad, 0x20, 0xc0, 0xe1, 0xb5, 0xe9, 0xb0, 0xc5, 0xb6, 0xf3,
+        ];
+
+        let result = decode_body(input, Some("euc-kr"));
+        if cfg!(feature = "encoding") {
+            assert_eq!(result.unwrap(), "내 품으로 돌아오라, 내 안에서 잠들거라");
+        } else {
+            assert!(result.is_err(), "Only utf-8 is supported");
+        }
+    }
+
+    #[test]
+    fn invalid_utf8_is_rejected() {
+        // 0x80 is a lone continuation byte, so this input can never be valid utf-8.
+        let input = vec![b'f', b'o', 0x80];
+        assert!(
+            decode_body(input, Some("utf-8")).is_err(),
+            "Rejects malformed utf-8"
+        );
+    }
+
+    #[test]
+    fn unknown_encoding_is_rejected() {
+        let result = decode_body(b"hello".to_vec(), Some("not-a-real-encoding"));
+        assert!(result.is_err(), "Rejects unsupported encoding labels");
+    }
+}