fix gateshead_gov_uk: parse collection dates from the rendered HTML schedule table instead of the JSON "future" list; accept integer UPRNs in test cases

This commit is contained in:
5ila5
2024-04-28 15:05:06 +02:00
committed by 5ila5
parent 21c6ef168d
commit e457f11f3b

View File

@@ -1,11 +1,11 @@
from datetime import datetime, timedelta
import base64
import json
import re
from datetime import datetime
import requests
import re
import json
import base64
from bs4 import BeautifulSoup
from waste_collection_schedule import Collection # type: ignore[attr-defined]
from waste_collection_schedule import Collection # type: ignore[attr-defined]
TITLE = "Gateshead Council"
DESCRIPTION = "Source for gateshead.gov.uk services for Gateshead"
@@ -13,7 +13,8 @@ URL = "gateshead.gov.uk"
# Test fixtures: UPRNs resolved by the Gateshead bin-collection checker.
# Both string and int UPRNs are accepted (Test_002/Test_003 use ints).
TEST_CASES = {
    "Test_001": {"uprn": "100000077407"},
    "Test_002": {"uprn": 100000058404},
    "Test_003": {"uprn": 100000033887},
}
ICON_MAP = {
@@ -29,37 +30,48 @@ class Source:
def fetch(self):
    """Fetch upcoming bin collections for the configured UPRN.

    Drives the Gateshead "Bin collection day checker" form: starts a
    session to obtain the form's hidden token fields, submits the UPRN,
    decodes the base64 JSON payload embedded in the response's script
    tag, then scrapes the rendered HTML schedule table it contains.

    Returns:
        list[Collection]: one entry per waste type per collection date.

    Raises:
        requests.HTTPError: if either HTTP request fails.
    """
    session = requests.Session()

    # Start a session so the server issues the hidden form tokens.
    r = session.get(
        "https://www.gateshead.gov.uk/article/3150/Bin-collection-day-checker"
    )
    r.raise_for_status()
    soup = BeautifulSoup(r.text, features="html.parser")

    # Extract form submission url and the hidden token fields the
    # checker requires on submit.
    form_url = soup.find("form", attrs={"id": "BINCOLLECTIONCHECKER_FORM"})[
        "action"
    ]
    pageSessionId = soup.find(
        "input", attrs={"name": "BINCOLLECTIONCHECKER_PAGESESSIONID"}
    )["value"]
    sessionId = soup.find(
        "input", attrs={"name": "BINCOLLECTIONCHECKER_SESSIONID"}
    )["value"]
    nonce = soup.find("input", attrs={"name": "BINCOLLECTIONCHECKER_NONCE"})[
        "value"
    ]

    form_data = {
        "BINCOLLECTIONCHECKER_PAGESESSIONID": pageSessionId,
        "BINCOLLECTIONCHECKER_SESSIONID": sessionId,
        "BINCOLLECTIONCHECKER_NONCE": nonce,
        # "BINCOLLECTIONCHECKER_ADDRESSSEARCH_TICKS": ticks,
        "BINCOLLECTIONCHECKER_FORMACTION_NEXT": "BINCOLLECTIONCHECKER_ADDRESSSEARCH_NEXTBUTTON",
        "BINCOLLECTIONCHECKER_ADDRESSSEARCH_UPRN": self._uprn,
        # Not quite sure why this is needed (cannot be empty); maybe used
        # if there are multiple matches — but the UPRN should be unique.
        "BINCOLLECTIONCHECKER_ADDRESSSEARCH_ADDRESSTEXT": " ",
    }

    # Submit form
    r = session.post(form_url, data=form_data)
    r.raise_for_status()

    # Extract encoded response data: the result page embeds its payload
    # as a base64 string assigned to a JS variable inside a script tag.
    soup = BeautifulSoup(r.text, features="html.parser")
    pattern = re.compile(
        r"var BINCOLLECTIONCHECKERFormData = \"(.*?)\";$", re.MULTILINE | re.DOTALL
    )
    script = soup.find("script", text=pattern)
    response_data = pattern.search(script.text).group(1)

    # Decode base64 encoded response data and convert to JSON.
    decoded_data = base64.b64decode(response_data)
    data = json.loads(decoded_data)
    soup = BeautifulSoup(
        data["HOUSEHOLDCOLLECTIONS_1"]["DISPLAYHOUSEHOLD"], features="html.parser"
    )

    # Extract entries by walking the schedule table: a <th colspan=3>
    # row names the month; the 3-cell rows that follow hold
    # (day, weekday, waste types).
    entries = []
    month = None
    for tr in soup.find_all("tr"):
        month_th = tr.find("th", attrs={"colspan": "3"})
        if month_th:
            # Split in case the month is followed by a year
            # (may happen in December; not sure).
            month = month_th.text.split(" ")[0]
            continue
        if not month:
            # Skip any rows that precede the first month header.
            continue
        tds = tr.find_all("td")
        if len(tds) != 3:
            continue
        day = tds[0].text
        waste_types = tds[2].text.split(" and ")
        now = datetime.now()
        dt = datetime.strptime(f"{now.year}-{month}-{day}", "%Y-%B-%d").date()
        # The page shows no year: dates falling in Jan-Mar while we are
        # in Nov/Dec belong to the coming year.
        if dt.month in (1, 2, 3) and now.month in (11, 12):
            dt = dt.replace(year=now.year + 1)
        for waste_type in waste_types:
            # Strip once so the label and the icon lookup agree —
            # splitting on " and " can leave surrounding whitespace.
            waste_type = waste_type.strip()
            entries.append(
                Collection(
                    date=dt,
                    t=waste_type,
                    icon=ICON_MAP.get(waste_type),
                )
            )
    return entries