Fix XMLParsedAsHTMLWarning in scrape integration (#159433)

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Franck Nijhof <git@frenck.dev>
2026-03-21 03:03:17 +01:00 · 2026-02-18 17:00:49 +01:00
parent bfea04b482
commit 68792f02d4
3 changed files with 175 additions and 1 deletions
--- a/tests/components/scrape/__init__.py
+++ b/tests/components/scrape/__init__.py
@@ -39,12 +39,14 @@ class MockRestData:
    ) -> None:
        """Init RestDataMock."""
        self.data: str | None = None
+        self.headers: dict[str, str] | None = None
        self.payload = payload
        self.count = 0

    async def async_update(self, data: bool | None = True) -> None:
        """Update."""
        self.count += 1
+        self.headers = {}
        if self.payload == "test_scrape_sensor":
            self.data = (
                # Default
@@ -74,5 +76,33 @@ class MockRestData:
            self.data = "<div class='return'>secret text</div>"
        if self.payload == "test_scrape_sensor_no_data":
            self.data = None
+        if self.payload == "test_scrape_xml":
+            # XML/RSS content for testing XML parser detection via Content-Type
+            self.headers = {"Content-Type": "application/rss+xml"}
+            self.data = (
+                '<?xml version="1.0" encoding="UTF-8"?>'
+                "<rss><channel><title>Test RSS Feed</title>"
+                "<item><title>Test Item</title><link>https://example.com/item</link></item>"
+                "</channel></rss>"
+            )
+        if self.payload == "test_scrape_xml_fallback":
+            # XML/RSS content with non-XML Content-Type for testing content-based detection
+            self.headers = {"Content-Type": "text/html"}
+            self.data = (
+                '<?xml version="1.0" encoding="UTF-8"?>'
+                "<rss><channel><title>Test RSS Feed</title>"
+                "<item><title>Test Item</title><link>https://example.com/item</link></item>"
+                "</channel></rss>"
+            )
+        if self.payload == "test_scrape_html5_with_xml_declaration":
+            # HTML5 with XML declaration, no Content-Type header, and uppercase tags
+            # Tests: XML stripping, content detection, case-insensitive selectors
+            self.data = (
+                '<?xml version="1.0" encoding="UTF-8"?>\n'
+                "<!DOCTYPE html>\n"
+                "<html><head><TITLE>Test Page</TITLE></head>"
+                "<body><DIV class='current-version'>"
+                "<H1>Current Version: 2021.12.10</H1></DIV></body></html>"
+            )
        if self.count == 3:
            self.data = None
--- a/tests/components/scrape/test_sensor.py
+++ b/tests/components/scrape/test_sensor.py
@@ -75,6 +75,116 @@ async def test_scrape_sensor(hass: HomeAssistant) -> None:
    assert state.state == "Current Version: 2021.12.10"


+async def test_scrape_xml_content_type(
+    hass: HomeAssistant, caplog: pytest.LogCaptureFixture
+) -> None:
+    """Test Scrape sensor with XML Content-Type header uses XML parser."""
+    config = {
+        DOMAIN: [
+            return_integration_config(
+                sensors=[
+                    {"select": "title", "name": "RSS Title"},
+                    # Test <link> tag - HTML parser treats this as self-closing,
+                    # but XML parser correctly parses the content
+                    {"select": "item link", "name": "RSS Link"},
+                ]
+            )
+        ]
+    }
+
+    mocker = MockRestData("test_scrape_xml")
+    with patch(
+        "homeassistant.components.rest.RestData",
+        return_value=mocker,
+    ):
+        assert await async_setup_component(hass, DOMAIN, config)
+        await hass.async_block_till_done()
+
+    # Verify XML Content-Type header is set
+    assert mocker.headers.get("Content-Type") == "application/rss+xml"
+
+    state = hass.states.get("sensor.rss_title")
+    assert state.state == "Test RSS Feed"
+
+    # Verify <link> content is correctly parsed with XML parser
+    link_state = hass.states.get("sensor.rss_link")
+    assert link_state.state == "https://example.com/item"
+
+    assert "XMLParsedAsHTMLWarning" not in caplog.text
+
+
+async def test_scrape_xml_declaration(
+    hass: HomeAssistant, caplog: pytest.LogCaptureFixture
+) -> None:
+    """Test Scrape sensor with XML declaration (no XML Content-Type) uses XML parser."""
+    config = {
+        DOMAIN: [
+            return_integration_config(
+                sensors=[{"select": "title", "name": "RSS Title"}]
+            )
+        ]
+    }
+
+    mocker = MockRestData("test_scrape_xml_fallback")
+    with patch(
+        "homeassistant.components.rest.RestData",
+        return_value=mocker,
+    ):
+        assert await async_setup_component(hass, DOMAIN, config)
+        await hass.async_block_till_done()
+
+    # Verify non-XML Content-Type but XML parser used due to <?xml declaration
+    assert mocker.headers.get("Content-Type") == "text/html"
+
+    state = hass.states.get("sensor.rss_title")
+    assert state.state == "Test RSS Feed"
+    assert "XMLParsedAsHTMLWarning" not in caplog.text
+
+
+async def test_scrape_html5_with_xml_declaration(
+    hass: HomeAssistant, caplog: pytest.LogCaptureFixture
+) -> None:
+    """Test HTML5 with XML declaration strips XML prefix and uses HTML parser.
+
+    This test verifies backward compatibility by testing:
+    - No Content-Type header (relies on content detection)
+    - Uppercase HTML tags with lowercase selectors (case-insensitive matching)
+    - Class selectors work correctly
+    - No XMLParsedAsHTMLWarning is logged
+    """
+    config = {
+        DOMAIN: [
+            return_integration_config(
+                sensors=[
+                    # Lowercase selector matches uppercase <H1> tag
+                    {"select": ".current-version h1", "name": "HA version"},
+                    # Lowercase selector matches uppercase <TITLE> tag
+                    {"select": "title", "name": "Page Title"},
+                ]
+            )
+        ]
+    }
+
+    mocker = MockRestData("test_scrape_html5_with_xml_declaration")
+    with patch(
+        "homeassistant.components.rest.RestData",
+        return_value=mocker,
+    ):
+        assert await async_setup_component(hass, DOMAIN, config)
+        await hass.async_block_till_done()
+
+    # Verify no Content-Type header is set (tests content-based detection)
+    assert "Content-Type" not in mocker.headers
+
+    state = hass.states.get("sensor.ha_version")
+    assert state.state == "Current Version: 2021.12.10"
+
+    title_state = hass.states.get("sensor.page_title")
+    assert title_state.state == "Test Page"
+
+    assert "XMLParsedAsHTMLWarning" not in caplog.text
+
+
 async def test_scrape_sensor_value_template(hass: HomeAssistant) -> None:
    """Test Scrape sensor with value template."""
    config = {