mirror of
https://github.com/Electric-Special/ha-core.git
synced 2026-03-21 03:03:17 +01:00
Fix XMLParsedAsHTMLWarning in scrape integration (#159433)
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com> Co-authored-by: Franck Nijhof <git@frenck.dev>
This commit is contained in:
@@ -39,12 +39,14 @@ class MockRestData:
|
||||
) -> None:
|
||||
"""Init RestDataMock."""
|
||||
self.data: str | None = None
|
||||
self.headers: dict[str, str] | None = None
|
||||
self.payload = payload
|
||||
self.count = 0
|
||||
|
||||
async def async_update(self, data: bool | None = True) -> None:
|
||||
"""Update."""
|
||||
self.count += 1
|
||||
self.headers = {}
|
||||
if self.payload == "test_scrape_sensor":
|
||||
self.data = (
|
||||
# Default
|
||||
@@ -74,5 +76,33 @@ class MockRestData:
|
||||
self.data = "<div class='return'>secret text</div>"
|
||||
if self.payload == "test_scrape_sensor_no_data":
|
||||
self.data = None
|
||||
if self.payload == "test_scrape_xml":
|
||||
# XML/RSS content for testing XML parser detection via Content-Type
|
||||
self.headers = {"Content-Type": "application/rss+xml"}
|
||||
self.data = (
|
||||
'<?xml version="1.0" encoding="UTF-8"?>'
|
||||
"<rss><channel><title>Test RSS Feed</title>"
|
||||
"<item><title>Test Item</title><link>https://example.com/item</link></item>"
|
||||
"</channel></rss>"
|
||||
)
|
||||
if self.payload == "test_scrape_xml_fallback":
|
||||
# XML/RSS content with non-XML Content-Type for testing content-based detection
|
||||
self.headers = {"Content-Type": "text/html"}
|
||||
self.data = (
|
||||
'<?xml version="1.0" encoding="UTF-8"?>'
|
||||
"<rss><channel><title>Test RSS Feed</title>"
|
||||
"<item><title>Test Item</title><link>https://example.com/item</link></item>"
|
||||
"</channel></rss>"
|
||||
)
|
||||
if self.payload == "test_scrape_html5_with_xml_declaration":
|
||||
# HTML5 with XML declaration, no Content-Type header, and uppercase tags
|
||||
# Tests: XML stripping, content detection, case-insensitive selectors
|
||||
self.data = (
|
||||
'<?xml version="1.0" encoding="UTF-8"?>\n'
|
||||
"<!DOCTYPE html>\n"
|
||||
"<html><head><TITLE>Test Page</TITLE></head>"
|
||||
"<body><DIV class='current-version'>"
|
||||
"<H1>Current Version: 2021.12.10</H1></DIV></body></html>"
|
||||
)
|
||||
if self.count == 3:
|
||||
self.data = None
|
||||
|
||||
@@ -75,6 +75,116 @@ async def test_scrape_sensor(hass: HomeAssistant) -> None:
|
||||
assert state.state == "Current Version: 2021.12.10"
|
||||
|
||||
|
||||
async def test_scrape_xml_content_type(
    hass: HomeAssistant, caplog: pytest.LogCaptureFixture
) -> None:
    """Check that an XML Content-Type header makes Scrape parse with XML rules."""
    # The second sensor targets <link>: an HTML parser treats that tag as
    # self-closing, so reading its text back shows the XML parser was used.
    sensor_definitions = [
        {"select": "title", "name": "RSS Title"},
        {"select": "item link", "name": "RSS Link"},
    ]
    setup_config = {DOMAIN: [return_integration_config(sensors=sensor_definitions)]}
    rest_mock = MockRestData("test_scrape_xml")

    with patch("homeassistant.components.rest.RestData", return_value=rest_mock):
        setup_ok = await async_setup_component(hass, DOMAIN, setup_config)
        assert setup_ok
        await hass.async_block_till_done()

    # The mock must have advertised the RSS/XML content type.
    assert mocker_content_type(rest_mock) == "application/rss+xml" if False else (
        rest_mock.headers.get("Content-Type") == "application/rss+xml"
    )

    title_state = hass.states.get("sensor.rss_title")
    assert title_state.state == "Test RSS Feed"

    # Text inside <link> survives only when the document is parsed as XML.
    link_state = hass.states.get("sensor.rss_link")
    assert link_state.state == "https://example.com/item"

    assert "XMLParsedAsHTMLWarning" not in caplog.text
|
||||
|
||||
|
||||
async def test_scrape_xml_declaration(
    hass: HomeAssistant, caplog: pytest.LogCaptureFixture
) -> None:
    """Check that a leading <?xml declaration selects XML parsing despite text/html."""
    sensor_definitions = [{"select": "title", "name": "RSS Title"}]
    setup_config = {DOMAIN: [return_integration_config(sensors=sensor_definitions)]}
    rest_mock = MockRestData("test_scrape_xml_fallback")

    with patch("homeassistant.components.rest.RestData", return_value=rest_mock):
        setup_ok = await async_setup_component(hass, DOMAIN, setup_config)
        assert setup_ok
        await hass.async_block_till_done()

    # The header claims HTML, so detection has to come from the payload itself.
    reported_content_type = rest_mock.headers.get("Content-Type")
    assert reported_content_type == "text/html"

    title_state = hass.states.get("sensor.rss_title")
    assert title_state.state == "Test RSS Feed"
    assert "XMLParsedAsHTMLWarning" not in caplog.text
|
||||
|
||||
|
||||
async def test_scrape_html5_with_xml_declaration(
    hass: HomeAssistant, caplog: pytest.LogCaptureFixture
) -> None:
    """Check an HTML5 page that starts with an XML declaration still scrapes as HTML.

    Backward-compatibility points covered here:
    - the mock sends no Content-Type header, so only the content can be inspected
    - lowercase selectors must match the payload's uppercase tags
    - a class selector resolves to the expected element
    - XMLParsedAsHTMLWarning never reaches the log
    """
    sensor_definitions = [
        # Lowercase "h1" has to match the uppercase <H1> element.
        {"select": ".current-version h1", "name": "HA version"},
        # Lowercase "title" has to match the uppercase <TITLE> element.
        {"select": "title", "name": "Page Title"},
    ]
    setup_config = {DOMAIN: [return_integration_config(sensors=sensor_definitions)]}
    rest_mock = MockRestData("test_scrape_html5_with_xml_declaration")

    with patch("homeassistant.components.rest.RestData", return_value=rest_mock):
        setup_ok = await async_setup_component(hass, DOMAIN, setup_config)
        assert setup_ok
        await hass.async_block_till_done()

    # No Content-Type was supplied, so parser choice was content-based.
    assert "Content-Type" not in rest_mock.headers

    version_state = hass.states.get("sensor.ha_version")
    assert version_state.state == "Current Version: 2021.12.10"

    page_title_state = hass.states.get("sensor.page_title")
    assert page_title_state.state == "Test Page"

    assert "XMLParsedAsHTMLWarning" not in caplog.text
|
||||
|
||||
|
||||
async def test_scrape_sensor_value_template(hass: HomeAssistant) -> None:
|
||||
"""Test Scrape sensor with value template."""
|
||||
config = {
|
||||
|
||||
Reference in New Issue
Block a user