diff --git a/CHANGES/10889.bugfix.rst b/CHANGES/10889.bugfix.rst
new file mode 100644
index 00000000000..4bba5595b9e
--- /dev/null
+++ b/CHANGES/10889.bugfix.rst
@@ -0,0 +1,4 @@
+Updated ``Content-Type`` header parsing to return ``application/octet-stream`` when the header contains invalid syntax.
+See :rfc:`9110#section-8.3-5`.
+
+-- by :user:`sgaist`.
diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt
index 84692200b6e..46547b871de 100644
--- a/CONTRIBUTORS.txt
+++ b/CONTRIBUTORS.txt
@@ -320,6 +320,7 @@ Roman Postnov
 Rong Zhang
 Samir Akarioh
 Samuel Colvin
+Samuel Gaist
 Sean Hunt
 Sebastian Acuna
 Sebastian Hanula
diff --git a/aiohttp/helpers.py b/aiohttp/helpers.py
index 48389264186..4aded6cd7e0 100644
--- a/aiohttp/helpers.py
+++ b/aiohttp/helpers.py
@@ -20,7 +20,9 @@
 from collections import namedtuple
 from collections.abc import Callable, Iterable, Iterator, Mapping
 from contextlib import suppress
+from email.message import EmailMessage
 from email.parser import HeaderParser
+from email.policy import HTTP
 from email.utils import parsedate
 from http.cookies import SimpleCookie
 from math import ceil
@@ -356,14 +358,40 @@ def parse_mimetype(mimetype: str) -> MimeType:
     )
 
 
+class EnsureOctetStream(EmailMessage):
+    def __init__(self) -> None:
+        super().__init__()
+        # https://www.rfc-editor.org/rfc/rfc9110#section-8.3-5
+        self.set_default_type("application/octet-stream")
+
+    def get_content_type(self) -> str:
+        """Re-implementation from Message
+
+        Returns application/octet-stream in place of text/plain when
+        value is wrong.
+
+        The way this class is used guarantees that content-type will
+        be present so simplify the checks wrt the base implementation.
+        """
+        value: str = self.get("content-type", "").lower()
+
+        # Based on the implementation of _splitparam in the standard library
+        ctype, _, _ = value.partition(";")
+        ctype = ctype.strip()
+        if ctype.count("/") != 1:
+            return self.get_default_type()
+        return ctype
+
+
 @functools.lru_cache(maxsize=56)
 def parse_content_type(raw: str) -> tuple[str, MappingProxyType[str, str]]:
     """Parse Content-Type header.
 
     Returns a tuple of the parsed content type and a
-    MappingProxyType of parameters.
+    MappingProxyType of parameters. The default returned value
+    is `application/octet-stream`.
     """
-    msg = HeaderParser().parsestr(f"Content-Type: {raw}")
+    msg = HeaderParser(EnsureOctetStream, policy=HTTP).parsestr(f"Content-Type: {raw}")
     content_type = msg.get_content_type()
     params = msg.get_params(())
     content_dict = dict(params[1:])  # First element is content type again
diff --git a/docs/client_reference.rst b/docs/client_reference.rst
index a262bd47a1a..50d158b5c2a 100644
--- a/docs/client_reference.rst
+++ b/docs/client_reference.rst
@@ -1550,16 +1550,14 @@ Response object
 
       .. note::
 
-         Returns value is ``'application/octet-stream'`` if no
-         Content-Type header present in HTTP headers according to
-         :rfc:`9110`. If the *Content-Type* header is invalid (e.g., ``jpg``
-         instead of ``image/jpeg``), the value is ``text/plain`` by default
-         according to :rfc:`2045`. To see the original header check
-         ``resp.headers['CONTENT-TYPE']``.
+         Returns ``'application/octet-stream'`` if no Content-Type header
+         is present or the value contains invalid syntax according to
+         :rfc:`9110`. To see the original header check
+         ``resp.headers["Content-Type"]``.
 
          To make sure Content-Type header is not present in
          the server reply, use :attr:`headers` or :attr:`raw_headers`, e.g.
-         ``'CONTENT-TYPE' not in resp.headers``.
+         ``'Content-Type' not in resp.headers``.
 
    .. attribute:: charset
 
diff --git a/requirements/constraints.txt b/requirements/constraints.txt
index fc2518717d7..463f31e2324 100644
--- a/requirements/constraints.txt
+++ b/requirements/constraints.txt
@@ -109,7 +109,7 @@ jinja2==3.1.6
     # via
     #   sphinx
     #   towncrier
-markdown-it-py==3.0.0
+markdown-it-py==4.0.0
     # via rich
 markupsafe==3.0.3
     # via jinja2
diff --git a/requirements/dev.txt b/requirements/dev.txt
index db6a3d6d397..d26e7ebfb6f 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -107,7 +107,7 @@ jinja2==3.1.6
     # via
     #   sphinx
     #   towncrier
-markdown-it-py==3.0.0
+markdown-it-py==4.0.0
     # via rich
 markupsafe==3.0.3
     # via jinja2
diff --git a/requirements/lint.txt b/requirements/lint.txt
index 2c34197b353..fd33c6e53fa 100644
--- a/requirements/lint.txt
+++ b/requirements/lint.txt
@@ -43,7 +43,7 @@ iniconfig==2.1.0
     # via pytest
 isal==1.7.2
     # via -r requirements/lint.in
-markdown-it-py==3.0.0
+markdown-it-py==4.0.0
     # via rich
 mdurl==0.1.2
     # via markdown-it-py
diff --git a/requirements/test.txt b/requirements/test.txt
index 7b311b63e59..4269ff2800f 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -55,7 +55,7 @@ iniconfig==2.1.0
     # via pytest
 isal==1.7.2 ; python_version < "3.14"
     # via -r requirements/test-common.in
-markdown-it-py==3.0.0
+markdown-it-py==4.0.0
     # via rich
 mdurl==0.1.2
     # via markdown-it-py
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index fc9069729d5..80adbb7dd15 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -7,6 +7,7 @@
 from collections.abc import Iterator
 from math import ceil, modf
 from pathlib import Path
+from types import MappingProxyType
 from unittest import mock
 from urllib.request import getproxies_environment
 
@@ -81,6 +82,30 @@ def test_parse_mimetype(mimetype: str, expected: helpers.MimeType) -> None:
     assert result == expected
 
 
+# ------------------- parse_content_type ------------------------------
+
+
+@pytest.mark.parametrize(
+    "content_type, expected",
+    [
+        (
+            "text/plain",
+            ("text/plain", MappingProxyType({})),
+        ),
+        (
+            "wrong",
+            ("application/octet-stream", MappingProxyType({})),
+        ),
+    ],
+)
+def test_parse_content_type(
+    content_type: str, expected: tuple[str, MappingProxyType[str, str]]
+) -> None:
+    result = helpers.parse_content_type(content_type)
+
+    assert result == expected
+
+
 # ------------------- guess_filename ----------------------------------
 
 
diff --git a/tests/test_web_response.py b/tests/test_web_response.py
index 8e2864fc5fc..57c8fbf9c83 100644
--- a/tests/test_web_response.py
+++ b/tests/test_web_response.py
@@ -1023,10 +1023,10 @@ def test_ctor_content_type_with_extra() -> None:
     assert resp.headers["content-type"] == "text/plain; version=0.0.4; charset=utf-8"
 
 
-def test_invalid_content_type_parses_to_text_plain() -> None:
+def test_invalid_content_type_parses_to_application_octet_stream() -> None:
     resp = web.Response(text="test test", content_type="jpeg")
 
-    assert resp.content_type == "text/plain"
+    assert resp.content_type == "application/octet-stream"
     assert resp.headers["content-type"] == "jpeg; charset=utf-8"
 
 