diff --git a/CHANGES/2914.doc.rst b/CHANGES/2914.doc.rst new file mode 100644 index 00000000000..25592bf79bc --- /dev/null +++ b/CHANGES/2914.doc.rst @@ -0,0 +1,4 @@ +Improved documentation for middleware by adding warnings and examples about +request body stream consumption. The documentation now clearly explains that +request body streams can only be read once and provides best practices for +sharing parsed request data between middleware and handlers -- by :user:`bdraco`. diff --git a/docs/web_advanced.rst b/docs/web_advanced.rst index 76fc3ea57f1..f182acf11c1 100644 --- a/docs/web_advanced.rst +++ b/docs/web_advanced.rst @@ -568,9 +568,13 @@ A *middleware* is a coroutine that can modify either the request or response. For example, here's a simple *middleware* which appends ``' wink'`` to the response:: - from aiohttp.web import middleware + from aiohttp import web + from typing import Callable, Awaitable - async def middleware(request, handler): + async def middleware( + request: web.Request, + handler: Callable[[web.Request], Awaitable[web.StreamResponse]] + ) -> web.StreamResponse: resp = await handler(request) resp.text = resp.text + ' wink' return resp @@ -619,18 +623,25 @@ post-processing like handling *CORS* and so on. The following code demonstrates middlewares execution order:: from aiohttp import web + from typing import Callable, Awaitable - async def test(request): + async def test(request: web.Request) -> web.Response: print('Handler function called') return web.Response(text="Hello") - async def middleware1(request, handler): + async def middleware1( + request: web.Request, + handler: Callable[[web.Request], Awaitable[web.StreamResponse]] + ) -> web.StreamResponse: print('Middleware 1 called') response = await handler(request) print('Middleware 1 finished') return response - async def middleware2(request, handler): + async def middleware2( + request: web.Request, + handler: Callable[[web.Request], Awaitable[web.StreamResponse]] + ) -> web.StreamResponse: print('Middleware 2 called') response = await handler(request) print('Middleware 2 finished') @@ -649,6 +660,82 @@ Produced output:: Middleware 2 finished Middleware 1 finished +Request Body Stream Consumption +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. warning:: + + When middleware reads the request body (using :meth:`~aiohttp.web.BaseRequest.read`, + :meth:`~aiohttp.web.BaseRequest.text`, :meth:`~aiohttp.web.BaseRequest.json`, or + :meth:`~aiohttp.web.BaseRequest.post`), the body stream is consumed. However, these + high-level methods cache their result, so subsequent calls from the handler or other + middleware will return the same cached value. + + The important distinction is: + + - High-level methods (:meth:`~aiohttp.web.BaseRequest.read`, :meth:`~aiohttp.web.BaseRequest.text`, + :meth:`~aiohttp.web.BaseRequest.json`, :meth:`~aiohttp.web.BaseRequest.post`) cache their + results internally, so they can be called multiple times and will return the same value. + - Direct stream access via :attr:`~aiohttp.web.BaseRequest.content` does NOT have this + caching behavior. Once you read from ``request.content`` directly (e.g., using + ``await request.content.read()``), subsequent reads will return empty bytes. + +Consider this middleware that logs request bodies:: + + from aiohttp import web + from typing import Callable, Awaitable + + async def logging_middleware( + request: web.Request, + handler: Callable[[web.Request], Awaitable[web.StreamResponse]] + ) -> web.StreamResponse: + # This consumes the request body stream + body = await request.text() + print(f"Request body: {body}") + return await handler(request) + + async def handler(request: web.Request) -> web.Response: + # This will return the same value that was read in the middleware + # (i.e., the cached result, not an empty string) + body = await request.text() + return web.Response(text=f"Received: {body}") + +In contrast, when accessing the stream directly (not recommended in middleware):: + + async def stream_middleware( + request: web.Request, + handler: Callable[[web.Request], Awaitable[web.StreamResponse]] + ) -> web.StreamResponse: + # Reading directly from the stream - this consumes it! + data = await request.content.read() + print(f"Stream data: {data}") + return await handler(request) + + async def handler(request: web.Request) -> web.Response: + # This will return empty bytes because the stream was already consumed + data = await request.content.read() + # data will be b'' (empty bytes) + + # However, high-level methods would still work if called for the first time: + # body = await request.text() # This would read from internal cache if available + return web.Response(text=f"Received: {data}") + +When working with raw stream data that needs to be shared between middleware and handlers:: + + async def stream_parsing_middleware( + request: web.Request, + handler: Callable[[web.Request], Awaitable[web.StreamResponse]] + ) -> web.StreamResponse: + # Read stream once and store the data + raw_data = await request.content.read() + request['raw_body'] = raw_data + return await handler(request) + + async def handler(request: web.Request) -> web.Response: + # Access the stored data instead of reading the stream again + raw_data = request.get('raw_body', b'') + return web.Response(body=raw_data) + Example ^^^^^^^