#!/usr/bin/env python
# coding=utf-8
#
# Copy me if you can.
# by 20h
#
# Provenance: sfeed-atom/kvssachsen2atom from the brcon2023-hackathons
# repository (Bitreichcon 2023 Hackathon Repository),
#   git clone git://bitreich.org/brcon2023-hackathons
#   git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/brcon2023-hackathons
# Added in commit a7cd0c547c792f74b7784cc0a8c806380a28ca2f
# (parent 2922c09dc4919dcea4ac331bbaa4e373ba4ccc4a)
# Author: Christoph Lohmann <20h@r-36.net>
# Date:   Thu, 10 Aug 2023 16:10:01 +0200
# "Add example selenium script for the atom hackathon."
#

"""Render a news listing page in headless Chrome and print it as an Atom feed.

Usage: kvssachsen2atom [-h] URI

The feed is written to stdout; diagnostics go to stderr.
"""

import getopt
import os
import sys
from datetime import datetime, timezone
from urllib.parse import urljoin
from xml.sax.saxutils import escape, quoteattr

# Chrome command-line switches for the scraping session.
CHROME_ARGUMENTS = (
    "headless",
    "no-sandbox",
    "disable-extensions",
    "disable-dev-shm-usage",
    "start-maximized",
    "window-size=1900,1080",
    "disable-gpu",
)

# Seconds to wait for the page's script-rendered content to show up.
PAGE_LOAD_TIMEOUT = 60

# Author name written into every feed entry.
ENTRY_AUTHOR = "sachsen@kvsachsen.de"


def usage(app):
    """Write the usage line for *app* to stderr and exit with status 1."""
    app = os.path.basename(app)
    sys.stderr.write("usage: %s [-h] URI\n" % (app))
    sys.exit(1)


def cdata(text):
    """Return *text* wrapped in a CDATA section.

    A literal "]]>" inside *text* would terminate the section early, so it
    is split across two adjacent CDATA sections.
    """
    return "<![CDATA[%s]]>" % str(text).replace("]]>", "]]]]><![CDATA[>")


def entry_updated(datestr, now):
    """Normalize a "DD.MM.YYYY" date string to an aware datetime.

    The result is set to 12:00:00 in the timezone of *now*.  A year later
    than the year of *now* is clamped to the year of *now*.

    Returns None when *datestr* is not a valid "DD.MM.YYYY" date.
    """
    try:
        parsed = datetime.strptime(datestr.strip(), "%d.%m.%Y")
    except ValueError:
        return None

    parsed = parsed.replace(hour=12, minute=0, second=0, tzinfo=now.tzinfo)
    if parsed.year > now.year:
        try:
            parsed = parsed.replace(year=now.year)
        except ValueError:
            # 29 February does not exist in the clamped year.
            parsed = parsed.replace(year=now.year, day=28)
    return parsed


def feed_header(title, description, link, updated):
    """Return the opening lines of the Atom feed, up to <updated>."""
    return "\n".join((
        '<?xml version="1.0" encoding="utf-8"?>',
        '<feed xmlns="http://www.w3.org/2005/Atom">',
        "\t<title>%s</title>" % cdata(title),
        "\t<subtitle>%s</subtitle>" % cdata(description),
        # URLs may contain "&" and friends, so escape them for XML.
        "\t<id>%s</id>" % escape(link),
        "\t<link href=%s rel=\"self\" />" % quoteattr(link),
        "\t<link href=%s />" % quoteattr(link),
        "\t<updated>%s</updated>" % updated.isoformat(),
    ))


def feed_entry(link, title, author, updated, content):
    """Return one complete Atom <entry> element as a string."""
    return "\n".join((
        "\t<entry>",
        "\t\t<id>%s</id>" % escape(link),
        "\t\t<title>%s</title>" % cdata(title),
        "\t\t<link href=%s />" % quoteattr(link),
        "\t\t<author><name>%s</name></author>" % escape(author),
        "\t\t<updated>%s</updated>" % updated.isoformat(),
        "\t\t<content>%s</content>" % cdata(content),
        "\t</entry>",
    ))


def main(args):
    """Fetch the page named in *args* and print its articles as Atom.

    *args* has the shape of sys.argv: program name, options, then the URI.
    Returns 0 on success; exits with status 1 via usage() on bad arguments.
    """
    try:
        opts, largs = getopt.getopt(args[1:], "h")
    except getopt.GetoptError as err:
        # stdout carries the feed, so report problems on stderr.
        sys.stderr.write("%s\n" % (err))
        usage(args[0])

    for opt, _ in opts:
        if opt == "-h":
            usage(args[0])

    if not largs:
        usage(args[0])

    # Imported here so that argument handling and the formatting helpers
    # work without a browser stack installed.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options as chromeoptions
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    feedlink = largs[0]

    options = chromeoptions()
    for carg in CHROME_ARGUMENTS:
        options.add_argument(carg)

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(feedlink)

        # Block until the listing has been rendered.
        WebDriverWait(driver=driver, timeout=PAGE_LOAD_TIMEOUT).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@data-last-letter]")
            )
        )
        newslist = driver.find_element(
            By.XPATH, "//div[@data-filter-target=\"list\"]"
        )

        metas = driver.find_elements(
            By.XPATH, "//meta[@property=\"og:title\"]"
        )
        title = metas[0].get_attribute("content") if metas else None
        title = title or driver.title
        description = title

        utcnow = datetime.now(timezone.utc)
        print(feed_header(title, description, feedlink, utcnow))

        articles = newslist.find_elements(By.XPATH, "./div")
        for article in reversed(articles):
            anchors = article.find_elements(By.XPATH, "./a")
            times = article.find_elements(By.XPATH, ".//time")
            if not anchors or not times:
                # Not a regular article; leave it out of the feed.
                continue

            anchor = anchors[0]
            href = anchor.get_attribute("href")
            if not href:
                continue

            pupdated = entry_updated(times[0].text, utcnow)
            if pupdated is None:
                continue

            plink = urljoin(feedlink, href)
            ptitle = anchor.get_attribute("data-title") or anchor.text
            print(feed_entry(plink, ptitle, ENTRY_AUTHOR, pupdated,
                             article.text))

        print("</feed>")
    finally:
        # Always shut the browser down, also when scraping fails.
        driver.quit()

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))