"""Extract series information from the IMDB web page."""
from __future__ import annotations
import argparse
from contextlib import suppress
import csv
from importlib import metadata
from pathlib import Path
import re
from rich.console import Console
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
from .unicodewriter import UnicodeWriter
# Module metadata: timestamp of the last change, authorship and contact.
__date__ = "2024/08/03 22:11:35 hoel"
__author__ = "Berthold Höllmann"
__copyright__ = "Copyright © 2011,2013,2023 by Berthold Höllmann"
__credits__ = ["Berthold Höllmann"]
__maintainer__ = "Berthold Höllmann"
__email__ = "berhoel@gmail.com"
# Pattern for headers of the form "Season <n>, Episode <m>:".
# NOTE(review): not referenced anywhere in the visible part of this module.
EPISODE_HEADER = re.compile(r"Season (?P<season>\d+), Episode (?P<episode>\d+):")
# Shared rich console; used below to print progress and summary lines.
CONSOLE = Console()
[docs]
class IMDBEntry:
    """Handle IMDB episodes information for TV series.

    An instance stores season, episode number, title, URL and description of
    one episode and can render itself as one spreadsheet row (see `list`).
    Instances compare by episode number only, the season is not considered.
    """

    # Spreadsheet row number of the most recently rendered entry. The counter
    # is shared by all instances and incremented by every call to `list`;
    # starting at 2 makes the first rendered episode refer to row 3.
    line = 2
    # Link texts look like "S<season>.E<episode> ∙ <title>".
    TITLE_MATCH = re.compile(r"S(?P<season>\d+)\.E(?P<episode>\d+) ∙ (?P<title>.+)")

    def __init__(self, season: int, href_text: str, href: str, desc: str):
        """Initialize class instance.

        Args:
            season: Season the episode is expected to belong to.
            href_text: Text of the episode link, parsed with ``TITLE_MATCH``.
            href: Target of the episode link; a trailing query string
                (everything from the last "?") is dropped.
            desc: Episode description.

        Raises:
            ValueError: If ``href_text`` does not match ``TITLE_MATCH`` or the
                season found in it differs from ``season``.
        """
        self.season = season
        _match = self.TITLE_MATCH.match(href_text)
        if _match is None:
            msg = "Season info not found."
            raise ValueError(msg)
        if int(_match.group("season")) != self.season:
            msg = "Current season does not fit html data."
            raise ValueError(msg)
        self.episode = int(_match.group("episode"))
        self.title = _match.group("title")
        # ``str.rfind`` returns -1 when there is no "?"; using that value as a
        # slice end would silently cut off the last character of the URL.
        query_start = href.rfind("?")
        self.url = href if query_start == -1 else href[:query_start]
        self.descr = desc

    def list(self) -> tuple:
        """Return information list.

        Every call advances the shared row counter ``IMDBEntry.line`` by one,
        and the formula in the last element refers to that row number.

        Returns:
            A 9-tuple: ``None``, season, episode, three times ``None``, a
            ``HYPERLINK`` formula linking the title to the episode URL, the
            description, and a formula built from the German-localized
            spreadsheet functions WENN/ODER/ISTLEER/SVERWEIS/SUMMENPRODUKT.
        """
        IMDBEntry.line += 1
        return (
            None,
            self.season,
            self.episode,
            None,
            None,
            None,
            f'=HYPERLINK("{self.url}";"{self.title}")',
            self.descr,
            f"=WENN(ODER(ISTLEER(F{IMDBEntry.line});"
            f'ISTLEER(A{IMDBEntry.line}));"";'
            f"SVERWEIS(F{IMDBEntry.line};F$3:J$10000;5;0)/"
            f"SUMMENPRODUKT(((A$3:A$10000)>0)*((F$3:F$10000)="
            f"F{IMDBEntry.line})))",
        )

    # The rich comparisons return ``NotImplemented`` for foreign types. For
    # the ordering operators Python then raises ``TypeError`` itself; for
    # ``==`` / ``!=`` it falls back to identity instead of crashing.

    def __lt__(self, other: object) -> bool:
        """Return if instance is less than other."""
        if not isinstance(other, IMDBEntry):
            return NotImplemented
        return self.episode < other.episode

    def __le__(self, other: object) -> bool:
        """Return if instance is less than or equal other."""
        if not isinstance(other, IMDBEntry):
            return NotImplemented
        return self.episode <= other.episode

    def __eq__(self, other: object) -> bool:
        """Return if instance is equal other."""
        if not isinstance(other, IMDBEntry):
            return NotImplemented
        return self.episode == other.episode

    def __ne__(self, other: object) -> bool:
        """Return if instance is not equal other."""
        if not isinstance(other, IMDBEntry):
            return NotImplemented
        return self.episode != other.episode

    def __gt__(self, other: object) -> bool:
        """Return if instance is greater than other."""
        if not isinstance(other, IMDBEntry):
            return NotImplemented
        return self.episode > other.episode

    def __ge__(self, other: object) -> bool:
        """Return if instance is greater or equal than other."""
        if not isinstance(other, IMDBEntry):
            return NotImplemented
        return self.episode >= other.episode
[docs]
class IMDBInfo:
"""Process html page from IMDB."""
[docs]
def __init__(self, args: argparse.Namespace):
"""Initialize instance."""
self.url = self.get_url(args.url[0])
options = Options()
options.add_argument("--headless")
firefox_profile = FirefoxProfile()
firefox_profile.set_preference(key="intl.accept_languages", value=args.language)
firefox_profile.set_preference(key="javascript.enabled", value=True)
options.profile = firefox_profile
self.driver = webdriver.Firefox(options=options)
self.series = " ".join(args.title)
if len(self.series) == 0:
self.series = self.get_series()
self.driver.get(self.url)
[docs]
def __del__(self) -> None:
"""Cleuup."""
self.driver.quit()
[docs]
def get_url(self, url: Path | str) -> str:
"""Return preprocessed URL."""
path = Path(url)
res = url
if path.is_file():
with path.open() as csvfile:
inforeader = csv.reader(csvfile)
res = next(inforeader)[0].split('"')[1]
if not isinstance(res, str):
raise TypeError
while res.endswith("/"):
res = res[:-1]
if not res.endswith("/episodes"):
res += "/episodes"
return res
[docs]
def __call__(self) -> None:
"""Process data."""
self.process_data()
[docs]
def process_data(self) -> None:
"""Generate the csv file."""
wait = WebDriverWait(self.driver, 10)
def extract_url(inp: str | None) -> str:
if inp is None:
raise ValueError
return inp.split("=")[-1]
seasons = [
int(extract_url(i.get_attribute("href")))
for i in wait.until(
ec.element_to_be_clickable(
(By.XPATH, '//div[contains(@class,"ipc-tabs--display-chip")]')
)
).find_elements(
By.XPATH, '//div[contains(@class,"ipc-tabs--display-chip")]//a'
)
]
tbl = "".maketrans("/", "_")
with Path(f"{self.series.strip().translate(tbl)}.csv").open("w") as filep:
writer = UnicodeWriter(filep, delimiter=";", quoting=csv.QUOTE_MINIMAL)
writer.writerow(
[f'=HYPERLINK("{self.url.strip()[:-9]}";"{self.series.strip()}")']
)
writer.writerow(
[
'=HYPERLINK("#Übersicht";"Datum")',
None,
None,
"Disk",
"Index",
"Diskset",
"IMDB URL",
None,
"=MITTELWERT(I3:I10000)",
"=SUMME(J3:J10000)",
]
)
for season in seasons:
self.driver.get(f"{self.url}/?season={season}")
wait = WebDriverWait(self.driver, 10)
CONSOLE.print(f"Season {season}")
elem = wait.until(
ec.element_to_be_clickable(
(
By.XPATH,
'//article[contains(@class,"episode-item-wrapper")]',
)
)
)
articles = elem.find_elements(
By.XPATH,
'//article[contains(@class,"episode-item-wrapper")]',
)
episodes = []
for article in articles:
description_ = article.find_elements(
By.XPATH, './/div[@class="ipc-html-content-inner-div"]'
)
description = description_[0].text if description_ else ""
link = article.find_element(
By.XPATH, './/a[@class="ipc-title-link-wrapper"]'
)
href = link.get_attribute("href")
if not isinstance(href, str):
raise TypeError
episodes.append(IMDBEntry(season, link.text, href, description))
episodes.sort()
for i in episodes:
writer.writerow(
[j.strip() if isinstance(j, str) else j for j in i.list()]
)
[docs]
def get_series(self) -> str:
"""Get Series title."""
url = self.url[:-9]
driver = self.driver
driver.get(url)
wait = WebDriverWait(driver, 10)
wrapper = wait.until(ec.presence_of_element_located((By.TAG_NAME, "h1")))
res = wrapper.find_element(
By.XPATH, '//h1[contains(@data-testid, "hero__pageTitle")]'
).text
with suppress(Exception):
res = wrapper.find_element(
By.XPATH,
'//div[contains(@data-testid, "hero-title-block__original-title")]',
).text.strip()
if res.endswith(" (original title)"):
res = res[:-17]
if res.startswith("Original title: "):
res = res[16:]
if res.startswith("Originaltitel: "):
res = res[15:]
return res.strip()
[docs]
def get_parser() -> argparse.ArgumentParser:
    """Create CLI parser.

    The parser takes one positional ``URL`` (URL string or existing CSV
    file), optional ``TITLE`` words, ``--version`` and ``-l/--language``
    (default ``en``).

    Returns:
        The configured argument parser.
    """
    try:
        version = metadata.version("imdb_extract")
    except metadata.PackageNotFoundError:
        # No distribution metadata available (for example when the module is
        # used without being installed); the parser must still be usable.
        version = "unknown"
    parser = argparse.ArgumentParser(
        description="""Extract IMDB information for TV series. Generates file
<Title>.csv. If existing CSV file is given as argument, URL is read from
previously generated CSV file."""
    )
    parser.add_argument(
        "url", metavar="URL", type=str, nargs=1, help="URL string / existing CSV file"
    )
    parser.add_argument(
        "title", metavar="TITLE", type=str, nargs="*", help="title string", default=""
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {version}",
    )
    parser.add_argument(
        "-l",
        "--language",
        type=str,
        default="en",
        help="Language to use for download, default: %(default)s",
    )
    return parser
[docs]
def main() -> None:
    """Run the command line tool.

    Parses the command line, creates the extractor, prints the URL and the
    series title that will be used, writes the CSV file and finally ends the
    process by raising ``SystemExit``.
    """
    extractor = IMDBInfo(get_parser().parse_args())
    CONSOLE.print(f"URL : {extractor.url}")
    CONSOLE.print(f"series: {extractor.series}")
    # Calling the instance runs the extraction.
    extractor()
    # Raised without a status argument.
    raise SystemExit


# Script entry point.
if __name__ == "__main__":
    main()