From 7986f93e2a2b5713d4d9bedd02ea4644bfc9f2e9 Mon Sep 17 00:00:00 2001 From: 5ila5 <5ila5@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:30:42 +0200 Subject: [PATCH] a_region_ch move to service file now using the ICS download --- .../service/A_region_ch.py | 183 ++++++++++++++++++ .../source/a_region_ch.py | 156 ++++----------- 2 files changed, 216 insertions(+), 123 deletions(-) create mode 100644 custom_components/waste_collection_schedule/waste_collection_schedule/service/A_region_ch.py diff --git a/custom_components/waste_collection_schedule/waste_collection_schedule/service/A_region_ch.py b/custom_components/waste_collection_schedule/waste_collection_schedule/service/A_region_ch.py new file mode 100644 index 00000000..74c75330 --- /dev/null +++ b/custom_components/waste_collection_schedule/waste_collection_schedule/service/A_region_ch.py @@ -0,0 +1,183 @@ +from typing import Literal + +import requests +from bs4 import BeautifulSoup +from waste_collection_schedule.source.ics import Source as ICS + +SERVICES = { + "winterthur": "https://m.winterthur.ch", + "a_region": "https://www.a-region.ch", +} +SERVICES_LITERALS = Literal["winterthur", "a_region"] + + +class A_region_ch: + def __init__( + self, + service: SERVICES_LITERALS, + region_url: str, + district: str | None = None, + regex: str | None = None, + ): + if service not in SERVICES: + raise Exception(f"service '{service}' not found") + self._base_url = SERVICES[service] + + self._regex = regex + + self._municipality_url = region_url + self._district = district + + def fetch(self) -> list[ICS]: + waste_types = self.get_waste_types(self._municipality_url) + + entries = [] + + for tour, link in waste_types.items(): + entries += self.get_ICS_sources(link, tour) + return entries + + def get_municipalities(self) -> dict[str, str]: + municipalities: dict[str, str] = {} + + # get PHPSESSID + session = requests.session() + r = session.get(f"{self._base_url}") + r.raise_for_status() + + # cookies = 
{'PHPSESSID': requests.utils.dict_from_cookiejar(r.cookies)['PHPSESSID']} + + params: dict[str, str | int] = {"apid": "13875680", "apparentid": "4618613"} + r = session.get(f"{self._base_url}/index.php", params=params) + r.raise_for_status() + self.extract_municipalities(r.text, municipalities) + + page = 1 + while True: + params = { + "do": "searchFetchMore", + "hash": "606ee79ca61fc6eef434ab4fca0d5956", + "p": page, + } + headers = { + "cookie": "PHPSESSID=71v67j0et4ih04qa142d402ebm;" + } # TODO: get cookie from first request + r = session.get( + f"{self._base_url}/appl/ajax/index.php", params=params, headers=headers + ) + r.raise_for_status() + if r.text == "": + break + self.extract_municipalities(r.text, municipalities) + page = page + 1 + return municipalities + + def extract_municipalities(self, text: str, municipalities: dict[str, str]): + soup = BeautifulSoup(text, features="html.parser") + downloads = soup.find_all("a", href=True) + for download in downloads: + # href ::= "/index.hp" + href = download.get("href") + if "ref=search" in href: + for title in download.find_all("div", class_="title"): + # title ::= "Abfallkalender Andwil" + municipalities[title.string.removeprefix("Abfallkalender ")] = href + + def get_waste_types(self, link: str) -> dict[str, str]: + if not link.startswith("http"): + link = f"{self._base_url}{link}" + r = requests.get(link) + r.raise_for_status() + + waste_types = {} + + soup = BeautifulSoup(r.text, features="html.parser") + downloads = soup.find_all("a", href=True) + for download in downloads: + # href ::= "/index.php?apid=12731252&apparentid=5011362" + href = download.get("href") + if download.find("div", class_="badgeIcon") or download.find( + "img", class_="rowImg" + ): + titles = download.find_all("div", class_="title") + if "PDF" in titles: + continue + titles = [title.string for title in titles] + if not titles: + titles = [download.get_text(strip=True)] + for title in titles: + # title ::= "Altmetall" + 
waste_types[title] = href + + return waste_types + + def get_ICS_sources(self, link: str, tour: str) -> list[ICS]: + if not link.startswith("http"): + link = f"{self._base_url}{link}" + r = requests.get(link) + r.raise_for_status() + + soup = BeautifulSoup(r.text, features="html.parser") + + # check for additional districts + districts = {} + downloads = soup.find_all("a", href=True) + for download in downloads: + href = download.get("href") + if "apparentid" in href: + title = download.find("div", class_="title") + if title is not None: + # additional district found -> + district_name_split = title.string.split(": ") + districts[ + district_name_split[1 if len(district_name_split) > 1 else 0] + ] = href + if len(districts) > 0: + if len(districts) == 1: + # only one district found -> use it + return self.get_ICS_sources(list(districts.values())[0], tour) + if self._district is None: + raise Exception("district is missing") + if self._district not in districts: + raise Exception(f"district '{self._district}' not found") + return self.get_ICS_sources(districts[self._district], tour) + + dates = list() + + downloads = soup.find_all("a", href=True) + for download in downloads: + # href ::= "/appl/ics.php?apid=12731252&from=2022-05-04%2013%3A00%3A00&to=2022-05-04%2013%3A00%3A00" + href = download.get("href") + if href.startswith("webcal") and "ical.php" in href: + dates.append(ICS(url=href, regex=self._regex)) + + return dates + + +def get_region_url_by_street( + service: SERVICES_LITERALS, + street: str, + search_url: str, + district: str | None = None, + regex: str | None = None, +) -> A_region_ch: + r = requests.get(search_url, params={"q": street}) + r.raise_for_status() + + soup = BeautifulSoup(r.text, features="html.parser") + as_ = soup.select("a") + if len(as_) == 0: + raise Exception("No streets found") + streets = [] + for a in as_: + href = a.get("href") + if not isinstance(href, str): + continue + streets.append(a.get_text(strip=True)) + + if 
a.get_text(strip=True).lower().replace(" ", "") == street.lower().replace( + " ", "" + ): + return A_region_ch(service, href, district, regex) + + raise Exception(f"Street not found, use one of: {streets}") diff --git a/custom_components/waste_collection_schedule/waste_collection_schedule/source/a_region_ch.py b/custom_components/waste_collection_schedule/waste_collection_schedule/source/a_region_ch.py index a463800e..36fe3ca9 100644 --- a/custom_components/waste_collection_schedule/waste_collection_schedule/source/a_region_ch.py +++ b/custom_components/waste_collection_schedule/waste_collection_schedule/source/a_region_ch.py @@ -1,9 +1,5 @@ -import datetime -from urllib.parse import parse_qs, urlparse - -import requests -from bs4 import BeautifulSoup -from waste_collection_schedule import Collection # type: ignore[attr-defined] +from waste_collection_schedule import Collection +from waste_collection_schedule.service.A_region_ch import A_region_ch TITLE = "A-Region" DESCRIPTION = "Source for A-Region, Switzerland waste collection." 
@@ -21,7 +17,6 @@ TEST_CASES = { "Speicher": {"municipality": "Speicher"}, } -BASE_URL = "https://www.a-region.ch" MUNICIPALITIES = { "Andwil": "/index.php?ref=search&refid=13875680&apid=5011362", @@ -67,128 +62,43 @@ class Source: def __init__(self, municipality, district=None): self._municipality = municipality self._district = district + if municipality not in MUNICIPALITIES: + raise Exception(f"municipality '{municipality}' not found") + self._municipality_url = MUNICIPALITIES[municipality] - def fetch(self): - # municipalities = self.get_municipalities() - municipalities = MUNICIPALITIES - if self._municipality not in municipalities: - raise Exception(f"municipality '{self._municipality}' not found") + self._ics_sources = [] - waste_types = self.get_waste_types(municipalities[self._municipality]) + def _get_ics_sources(self): + self._ics_sources = A_region_ch( + "a_region", self._municipality_url, self._district + ).fetch() + + def fetch(self) -> list[Collection]: + fresh_sources = False + if not self._ics_sources: + fresh_sources = True + self._get_ics_sources() entries = [] - - for waste_type, link in waste_types.items(): - dates = self.get_dates(link) - - for d in dates: - entries.append(Collection(d, waste_type)) - + for source in self._ics_sources: + fresh_sources, e = self._get_dates(source, fresh_sources) + entries += e return entries - def get_municipalities(self): - municipalities = {} + def _get_dates(self, source, fresh=False) -> tuple[bool, list[Collection]]: + exception = None + try: + entries = source.fetch() + except Exception as e: + exception = e - # get PHPSESSID - session = requests.session() - r = session.get(f"{BASE_URL}") - r.raise_for_status() + if exception or not entries: + if fresh: + if exception: + raise exception + return fresh, [] - # cookies = {'PHPSESSID': requests.utils.dict_from_cookiejar(r.cookies)['PHPSESSID']} + self._get_ics_sources() + return self._get_dates(source, fresh=True) - params = {"apid": "13875680", 
"apparentid": "4618613"} - r = session.get(f"{BASE_URL}/index.php", params=params) - r.raise_for_status() - self.extract_municipalities(r.text, municipalities) - - page = 1 - while True: - params = { - "do": "searchFetchMore", - "hash": "606ee79ca61fc6eef434ab4fca0d5956", - "p": page, - } - headers = { - "cookie": "PHPSESSID=71v67j0et4ih04qa142d402ebm;" - } # TODO: get cookie from first request - r = session.get( - f"{BASE_URL}/appl/ajax/index.php", params=params, headers=headers - ) - r.raise_for_status() - if r.text == "": - break - self.extract_municipalities(r.text, municipalities) - page = page + 1 - return municipalities - - def extract_municipalities(self, text, municipalities): - soup = BeautifulSoup(text, features="html.parser") - downloads = soup.find_all("a", href=True) - for download in downloads: - # href ::= "/index.hp" - href = download.get("href") - if "ref=search" in href: - for title in download.find_all("div", class_="title"): - # title ::= "Abfallkalender Andwil" - municipalities[title.string.removeprefix("Abfallkalender ")] = href - - def get_waste_types(self, link): - r = requests.get(f"{BASE_URL}{link}") - r.raise_for_status() - - waste_types = {} - - soup = BeautifulSoup(r.text, features="html.parser") - downloads = soup.find_all("a", href=True) - for download in downloads: - # href ::= "/index.php?apid=12731252&apparentid=5011362" - href = download.get("href") - if "apparentid" in href: - for title in download.find_all("div", class_="title"): - # title ::= "Altmetall" - waste_types[title.string] = href - - return waste_types - - def get_dates(self, link): - r = requests.get(f"{BASE_URL}{link}") - r.raise_for_status() - - soup = BeautifulSoup(r.text, features="html.parser") - - # check for additional districts - districts = {} - downloads = soup.find_all("a", href=True) - for download in downloads: - href = download.get("href") - if "apparentid" in href: - title = download.find("div", class_="title") - if title is not None: - # additional 
district found -> - district_name_split = title.string.split(": ") - districts[ - district_name_split[1 if len(district_name_split) > 1 else 0] - ] = href - if len(districts) > 0: - if len(districts) == 1: - # only one district found -> use it - return self.get_dates(list(districts.values())[0]) - if self._district is None: - raise Exception("district is missing") - if self._district not in districts: - raise Exception(f"district '{self._district}' not found") - return self.get_dates(districts[self._district]) - - dates = set() - - downloads = soup.find_all("a", href=True) - for download in downloads: - # href ::= "/appl/ics.php?apid=12731252&from=2022-05-04%2013%3A00%3A00&to=2022-05-04%2013%3A00%3A00" - href = download.get("href") - if "ics.php" in href: - parsed = urlparse(href) - query = parse_qs(parsed.query) - date = datetime.datetime.strptime(query["from"][0], "%Y-%m-%d %H:%M:%S") - dates.add(date.date()) - - return dates + return fresh, entries