Fix XMLParsedAsHTMLWarning in scrape integration (#159433)

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Franck Nijhof <git@frenck.dev>
This commit is contained in:
Ivan Dlugos
2026-02-18 17:00:49 +01:00
committed by GitHub
parent bfea04b482
commit 68792f02d4
3 changed files with 175 additions and 1 deletions

View File

@@ -39,12 +39,14 @@ class MockRestData:
) -> None:
"""Init RestDataMock."""
self.data: str | None = None
self.headers: dict[str, str] | None = None
self.payload = payload
self.count = 0
async def async_update(self, data: bool | None = True) -> None:
"""Update."""
self.count += 1
self.headers = {}
if self.payload == "test_scrape_sensor":
self.data = (
# Default
@@ -74,5 +76,33 @@ class MockRestData:
self.data = "<div class='return'>secret text</div>"
if self.payload == "test_scrape_sensor_no_data":
self.data = None
if self.payload == "test_scrape_xml":
# XML/RSS content for testing XML parser detection via Content-Type
self.headers = {"Content-Type": "application/rss+xml"}
self.data = (
'<?xml version="1.0" encoding="UTF-8"?>'
"<rss><channel><title>Test RSS Feed</title>"
"<item><title>Test Item</title><link>https://example.com/item</link></item>"
"</channel></rss>"
)
if self.payload == "test_scrape_xml_fallback":
# XML/RSS content with non-XML Content-Type for testing content-based detection
self.headers = {"Content-Type": "text/html"}
self.data = (
'<?xml version="1.0" encoding="UTF-8"?>'
"<rss><channel><title>Test RSS Feed</title>"
"<item><title>Test Item</title><link>https://example.com/item</link></item>"
"</channel></rss>"
)
if self.payload == "test_scrape_html5_with_xml_declaration":
# HTML5 with XML declaration, no Content-Type header, and uppercase tags
# Tests: XML stripping, content detection, case-insensitive selectors
self.data = (
'<?xml version="1.0" encoding="UTF-8"?>\n'
"<!DOCTYPE html>\n"
"<html><head><TITLE>Test Page</TITLE></head>"
"<body><DIV class='current-version'>"
"<H1>Current Version: 2021.12.10</H1></DIV></body></html>"
)
if self.count == 3:
self.data = None

View File

@@ -75,6 +75,116 @@ async def test_scrape_sensor(hass: HomeAssistant) -> None:
assert state.state == "Current Version: 2021.12.10"
async def test_scrape_xml_content_type(
    hass: HomeAssistant, caplog: pytest.LogCaptureFixture
) -> None:
    """Check that an XML Content-Type header leads to XML parsing of the payload."""
    sensor_definitions = [
        {"select": "title", "name": "RSS Title"},
        # <link> is handled as a self-closing tag by an HTML parser, so its
        # text is only available when the document is parsed as XML.
        {"select": "item link", "name": "RSS Link"},
    ]
    scrape_config = {DOMAIN: [return_integration_config(sensors=sensor_definitions)]}
    rest_mock = MockRestData("test_scrape_xml")

    with patch("homeassistant.components.rest.RestData", return_value=rest_mock):
        setup_ok = await async_setup_component(hass, DOMAIN, scrape_config)
        assert setup_ok
        await hass.async_block_till_done()

    # The mock must have reported the RSS media type for this payload.
    content_type = rest_mock.headers.get("Content-Type")
    assert content_type == "application/rss+xml"

    title_state = hass.states.get("sensor.rss_title")
    assert title_state.state == "Test RSS Feed"

    # The text inside <link> must survive parsing.
    link_state = hass.states.get("sensor.rss_link")
    assert link_state.state == "https://example.com/item"

    assert "XMLParsedAsHTMLWarning" not in caplog.text


async def test_scrape_xml_declaration(
    hass: HomeAssistant, caplog: pytest.LogCaptureFixture
) -> None:
    """Check that a leading <?xml declaration selects XML parsing without an XML header."""
    sensor_definitions = [{"select": "title", "name": "RSS Title"}]
    scrape_config = {DOMAIN: [return_integration_config(sensors=sensor_definitions)]}
    rest_mock = MockRestData("test_scrape_xml_fallback")

    with patch("homeassistant.components.rest.RestData", return_value=rest_mock):
        setup_ok = await async_setup_component(hass, DOMAIN, scrape_config)
        assert setup_ok
        await hass.async_block_till_done()

    # The header claims HTML, so detection has to come from the content itself.
    content_type = rest_mock.headers.get("Content-Type")
    assert content_type == "text/html"

    title_state = hass.states.get("sensor.rss_title")
    assert title_state.state == "Test RSS Feed"

    assert "XMLParsedAsHTMLWarning" not in caplog.text


async def test_scrape_html5_with_xml_declaration(
    hass: HomeAssistant, caplog: pytest.LogCaptureFixture
) -> None:
    """Check HTML5 prefixed with an XML declaration is still scraped as HTML.

    Backward-compatibility scenario covered here:
    - the mock supplies no Content-Type header, so only the content can be used
    - the markup uses uppercase tags while the selectors are lowercase
    - a class selector is combined with a tag selector
    - XMLParsedAsHTMLWarning must not show up in the captured log
    """
    sensor_definitions = [
        # Targets the uppercase <H1> inside the "current-version" container.
        {"select": ".current-version h1", "name": "HA version"},
        # Targets the uppercase <TITLE> element.
        {"select": "title", "name": "Page Title"},
    ]
    scrape_config = {DOMAIN: [return_integration_config(sensors=sensor_definitions)]}
    rest_mock = MockRestData("test_scrape_html5_with_xml_declaration")

    with patch("homeassistant.components.rest.RestData", return_value=rest_mock):
        setup_ok = await async_setup_component(hass, DOMAIN, scrape_config)
        assert setup_ok
        await hass.async_block_till_done()

    # No Content-Type header was provided by the mock for this payload.
    assert "Content-Type" not in rest_mock.headers

    version_state = hass.states.get("sensor.ha_version")
    assert version_state.state == "Current Version: 2021.12.10"

    page_title_state = hass.states.get("sensor.page_title")
    assert page_title_state.state == "Test Page"

    assert "XMLParsedAsHTMLWarning" not in caplog.text


async def test_scrape_sensor_value_template(hass: HomeAssistant) -> None:
"""Test Scrape sensor with value template."""
config = {