@@ -144,12 +144,21 @@ impl Response {
144144 /// This method can be called after the body has already been read, but will
145145 /// produce an empty buffer.
146146 ///
147+ /// # Encodings
148+ ///
149+ /// If the "encoding" feature is enabled, this method tries to decode the body
150+ /// with the encoding that is specified in the Content-Type header. If the header
151+ /// does not specify an encoding, UTF-8 is assumed. If the "encoding" feature is
152+ /// disabled, Surf only supports reading UTF-8 response bodies. The "encoding"
153+ /// feature is enabled by default.
154+ ///
147155 /// # Errors
148156 ///
149157 /// Any I/O error encountered while reading the body is immediately returned
150158 /// as an `Err`.
151159 ///
152- /// If the body cannot be interpreted as valid UTF-8, an `Err` is returned.
160+ /// If the body cannot be interpreted because the encoding is unsupported or
161+ /// incorrect, an `Err` is returned.
153162 ///
154163 /// # Examples
155164 ///
@@ -162,7 +171,12 @@ impl Response {
162171 /// ```
163172 pub async fn body_string ( & mut self ) -> Result < String , Exception > {
164173 let bytes = self . body_bytes ( ) . await ?;
165- Ok ( String :: from_utf8 ( bytes) . map_err ( |e| io:: Error :: new ( io:: ErrorKind :: InvalidData , e) ) ?)
174+ let mime = self . mime ( ) ;
175+ let claimed_encoding = mime
176+ . as_ref ( )
177+ . and_then ( |mime| mime. get_param ( "charset" ) )
178+ . map ( |name| name. as_str ( ) ) ;
179+ decode_body ( bytes, claimed_encoding)
166180 }
167181
168182 /// Reads and deserializes the entire response body from json.
@@ -246,3 +260,185 @@ impl fmt::Debug for Response {
246260 . finish ( )
247261 }
248262}
263+
/// Error produced when a response body could not be turned into a string.
///
/// Besides the name of the encoding involved, the error keeps the untouched bytes of the body.
/// Callers can use them to special-case un-decodable bodies or to retry with a fallback
/// decoding strategy of their own.
#[derive(Clone)]
pub struct DecodeError {
    /// Name of the encoding the decode attempt was made with.
    pub encoding: String,
    /// The raw, undecoded bytes of the input.
    pub data: Vec<u8>,
}
276+
277+ // Override debug output so you don't get each individual byte in `data` printed out separately,
278+ // because it can be many megabytes large. The actual content is not that interesting anyways
279+ // and can be accessed manually if it is required.
280+ impl fmt:: Debug for DecodeError {
281+ #[ allow( missing_doc_code_examples) ]
282+ fn fmt ( & self , f : & mut fmt:: Formatter < ' _ > ) -> fmt:: Result {
283+ f. debug_struct ( "DecodeError" )
284+ . field ( "encoding" , & self . encoding )
285+ // Perhaps we can output the first N bytes of the response in the future
286+ . field ( "data" , & format ! ( "{} bytes" , self . data. len( ) ) )
287+ . finish ( )
288+ }
289+ }
290+
291+ impl fmt:: Display for DecodeError {
292+ #[ allow( missing_doc_code_examples) ]
293+ fn fmt ( & self , f : & mut fmt:: Formatter < ' _ > ) -> fmt:: Result {
294+ write ! ( f, "could not decode body as {}" , & self . encoding)
295+ }
296+ }
297+
298+ impl std:: error:: Error for DecodeError { }
299+
/// Tell whether `encoding_label` is one of the recognised names for UTF-8.
///
/// The comparison ignores ASCII case. The label is compared as given: surrounding whitespace
/// is not trimmed.
// Not every build configuration calls this helper (the `encoding_rs` based decoder resolves
// labels on its own), hence the lint exemption.
#[allow(dead_code)]
fn is_utf8_encoding(encoding_label: &str) -> bool {
    const UTF8_LABELS: [&str; 3] = ["utf-8", "utf8", "unicode-1-1-utf-8"];

    UTF8_LABELS
        .iter()
        .any(|known| encoding_label.eq_ignore_ascii_case(known))
}
307+
308+ /// Decode a response body as utf-8.
309+ ///
310+ /// # Errors
311+ ///
312+ /// If the body cannot be decoded as utf-8, this function returns an `std::io::Error` of kind
313+ /// `std::io::ErrorKind::InvalidData`, carrying a `DecodeError` struct.
314+ #[ cfg( not( feature = "encoding" ) ) ]
315+ fn decode_body ( bytes : Vec < u8 > , content_encoding : Option < & str > ) -> Result < String , Exception > {
316+ if is_utf8_encoding ( content_encoding. unwrap_or ( "utf-8" ) ) {
317+ Ok ( String :: from_utf8 ( bytes) . map_err ( |err| {
318+ let err = DecodeError {
319+ encoding : "utf-8" . to_string ( ) ,
320+ data : err. into_bytes ( ) ,
321+ } ;
322+ io:: Error :: new ( io:: ErrorKind :: InvalidData , err)
323+ } ) ?)
324+ } else {
325+ let err = DecodeError {
326+ encoding : "utf-8" . to_string ( ) ,
327+ data : bytes,
328+ } ;
329+ Err ( io:: Error :: new ( io:: ErrorKind :: InvalidData , err) . into ( ) )
330+ }
331+ }
332+
/// Decode a response body using the encoding named by `content_encoding`.
///
/// This is the decoder used on non-wasm targets when the "encoding" feature is enabled; it is
/// backed by `encoding_rs`. When `content_encoding` is `None`, utf-8 is assumed.
///
/// If the input bytes are valid utf-8, this does not make a copy.
///
/// # Errors
///
/// If an unsupported encoding is requested, or the body does not conform to the requested
/// encoding, this function returns an `std::io::Error` of kind `std::io::ErrorKind::InvalidData`,
/// carrying a `DecodeError` struct.
#[cfg(all(feature = "encoding", not(target_arch = "wasm32")))]
fn decode_body(bytes: Vec<u8>, content_encoding: Option<&str>) -> Result<String, Exception> {
    use encoding_rs::Encoding;
    use std::borrow::Cow;

    let label = content_encoding.unwrap_or("utf-8");

    // Guard: the label has to name an encoding that encoding_rs knows about.
    let encoding = match Encoding::for_label(label.as_bytes()) {
        Some(encoding) => encoding,
        None => {
            let unsupported = DecodeError {
                encoding: label.to_string(),
                data: bytes,
            };
            return Err(io::Error::new(io::ErrorKind::InvalidData, unsupported).into());
        }
    };

    // Guard: the body has to be well-formed in the encoding that ended up being used.
    let (text, actual_encoding, had_errors) = encoding.decode(&bytes);
    if had_errors {
        let malformed = DecodeError {
            encoding: actual_encoding.name().into(),
            data: bytes,
        };
        return Err(io::Error::new(io::ErrorKind::InvalidData, malformed).into());
    }

    match text {
        Cow::Owned(converted) => Ok(converted),
        // SAFETY: If encoding_rs returned a `Cow::Borrowed`, the bytes are guaranteed to be
        // valid UTF-8, by virtue of being UTF-8 or being in the subset of ASCII that is the
        // same in UTF-8. Reusing the buffer avoids copying the body.
        // NOTE(review): `decode` performs BOM sniffing; presumably a leading UTF-8 BOM stays
        // in the returned string on this path — confirm that this is intended.
        Cow::Borrowed(_) => Ok(unsafe { String::from_utf8_unchecked(bytes) }),
    }
}
373+
/// Decode a response body using the encoding named by `content_encoding`.
///
/// This is the decoder used on wasm32 targets when the "encoding" feature is enabled. When
/// `content_encoding` is `None`, utf-8 is assumed.
///
/// Bodies labelled as utf-8 are converted directly with `String::from_utf8`. Every other
/// encoding is handed to `web_sys::TextDecoder`.
///
/// # Errors
///
/// If an unsupported encoding is requested, or the body does not conform to the requested
/// encoding, this function returns an `std::io::Error` of kind `std::io::ErrorKind::InvalidData`,
/// carrying a `DecodeError` struct.
#[cfg(all(feature = "encoding", target_arch = "wasm32"))]
fn decode_body(mut bytes: Vec<u8>, content_encoding: Option<&str>) -> Result<String, Exception> {
    use web_sys::TextDecoder;

    // Encoding names are always valid ASCII, so we can avoid including casing mapping tables
    let content_encoding = content_encoding.unwrap_or("utf-8").to_ascii_lowercase();

    if is_utf8_encoding(&content_encoding) {
        return String::from_utf8(bytes).map_err(|err| {
            // Carry a `DecodeError` like every other failure path, giving the caller the
            // original bytes back without copying them.
            let err = DecodeError {
                encoding: "utf-8".to_string(),
                data: err.into_bytes(),
            };
            io::Error::new(io::ErrorKind::InvalidData, err).into()
        });
    }

    // Constructing the decoder fails for labels it does not recognise. The label comes from
    // the remote server, so this must be reported as an error rather than a panic.
    let decoder = match TextDecoder::new_with_label(&content_encoding) {
        Ok(decoder) => decoder,
        Err(_) => {
            let err = DecodeError {
                encoding: content_encoding,
                data: bytes,
            };
            return Err(io::Error::new(io::ErrorKind::InvalidData, err).into());
        }
    };

    match decoder.decode_with_u8_array(&mut bytes) {
        Ok(text) => Ok(text),
        Err(_) => {
            let err = DecodeError {
                encoding: content_encoding,
                data: bytes,
            };
            Err(io::Error::new(io::ErrorKind::InvalidData, err).into())
        }
    }
}
404+
// Unit tests for `decode_body`. They are written to pass both with and without the "encoding"
// feature enabled.
#[cfg(test)]
mod decode_tests {
    use super::decode_body;

    /// Sample text containing characters outside of ASCII.
    const SAMPLE: &str = "Rød grød med fløde";

    #[test]
    fn utf8() {
        let decoded = decode_body(SAMPLE.as_bytes().to_vec(), Some("utf-8")).unwrap();
        assert_eq!(decoded, SAMPLE, "Parses utf-8");
    }

    #[test]
    fn default_utf8() {
        let decoded = decode_body(SAMPLE.as_bytes().to_vec(), None).unwrap();
        assert_eq!(decoded, SAMPLE, "Defaults to utf-8");
    }

    #[test]
    fn euc_kr() {
        // The expected Korean sentence below, encoded as EUC-KR.
        let encoded: Vec<u8> = vec![
            0xb3, 0xbb, 0x20, 0xc7, 0xb0, 0xc0, 0xb8, 0xb7, 0xce, 0x20, 0xb5, 0xb9, 0xbe, 0xc6,
            0xbf, 0xc0, 0xb6, 0xf3, 0x2c, 0x20, 0xb3, 0xbb, 0x20, 0xbe, 0xc8, 0xbf, 0xa1, 0xbc,
            0xad, 0x20, 0xc0, 0xe1, 0xb5, 0xe9, 0xb0, 0xc5, 0xb6, 0xf3,
        ];

        let outcome = decode_body(encoded, Some("euc-kr"));

        // Without the "encoding" feature anything other than utf-8 has to be rejected.
        if !cfg!(feature = "encoding") {
            assert!(outcome.is_err(), "Only utf-8 is supported");
            return;
        }

        assert_eq!(outcome.unwrap(), "내 품으로 돌아오라, 내 안에서 잠들거라");
    }
}
0 commit comments