#!/usr/bin/env python
# coding=utf-8
#
# Copy me if you can.
# by 20h
#
# Origin: file "kvssachsen2atom" in the brcon2023-hackathons repository
# (git clone git://bitreich.org/brcon2023-hackathons).
#
"""Scrape a JavaScript-rendered news listing and print it as an Atom feed.

The page at the given URI is loaded in headless Chrome through Selenium.
Once the listing has been rendered, every article found in it is written
to stdout as an Atom <entry>.
"""

import getopt
import os
import sys
from datetime import datetime
from urllib.parse import urljoin
from xml.sax.saxutils import escape, quoteattr

import pytz
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as chromeoptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def usage(app):
    """Write the usage line for *app* to stderr and exit with status 1."""
    app = os.path.basename(app)
    sys.stderr.write("usage: %s [-h] URI\n" % (app))
    sys.exit(1)


def _cdata(text):
    """Wrap *text* in a CDATA section.

    A literal "]]>" inside the text would terminate the section early, so
    it is split across two adjacent CDATA sections.
    """
    return "<![CDATA[%s]]>" % (str(text).replace("]]>", "]]]]><![CDATA[>"))


def _entry_timestamp(text, now):
    """Parse a "DD.MM.YYYY" date into an aware UTC datetime at 12:00.

    Dates whose year lies after ``now.year`` are moved into ``now.year``.
    Returns None when *text* is not a valid date or cannot be moved into
    the current year (29 February into a non-leap year).
    """
    try:
        stamp = datetime.strptime(text, "%d.%m.%Y")
    except ValueError:
        return None

    # The page only carries a date; pin the time to noon UTC.
    stamp = stamp.replace(hour=12, minute=0, second=0, tzinfo=pytz.utc)
    if stamp.year > now.year:
        try:
            stamp = stamp.replace(year=now.year)
        except ValueError:
            return None
    return stamp


def _print_entry(article, pageuri, now):
    """Print one Atom <entry> for *article*.

    Articles without a link, without a <time> element or with an
    unparsable date are skipped silently.
    """
    anchors = article.find_elements(By.XPATH, "./a")
    if not anchors:
        return
    anchor = anchors[0]

    href = anchor.get_attribute("href")
    if not href:
        return
    # Resolve relative references against the page URI.
    plink = urljoin(pageuri, href)

    times = article.find_elements(By.XPATH, ".//time")
    if not times:
        return
    pupdated = _entry_timestamp(times[0].text, now)
    if pupdated is None:
        return

    # Fall back to the visible link text when data-title is missing.
    ptitle = anchor.get_attribute("data-title") or anchor.text
    pcontent = article.text
    pauthor = "sachsen@kvsachsen.de"

    print("\t<entry>")
    print("\t\t<id>%s</id>" % (escape(plink)))
    print("\t\t<title>%s</title>" % (_cdata(ptitle)))
    print("\t\t<link href=%s />" % (quoteattr(plink)))
    print("\t\t<author><name>%s</name></author>" % (escape(pauthor)))
    print("\t\t<updated>%s</updated>" % (pupdated.isoformat()))
    print("\t\t<content>%s</content>" % (_cdata(pcontent)))
    print("\t</entry>")


def main(args):
    """Print the Atom feed for the URI given on the command line.

    *args* is the full argument vector including the program name.
    Returns 0 on success and 1 when the page has no news list; usage
    errors exit through usage().
    """
    try:
        opts, largs = getopt.getopt(args[1:], "h")
    except getopt.GetoptError as err:
        # stdout carries the feed, so diagnostics go to stderr.
        sys.stderr.write("%s\n" % (err))
        usage(args[0])

    for o, _ in opts:
        if o == "-h":
            usage(args[0])

    if len(largs) < 1:
        usage(args[0])

    link = largs[0]

    options = chromeoptions()
    chromearguments = [
        "headless",
        "no-sandbox",
        "disable-extensions",
        "disable-dev-shm-usage",
        "start-maximized",
        "window-size=1900,1080",
        "disable-gpu",
    ]
    for carg in chromearguments:
        options.add_argument(carg)

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(link)

        # Block until the page has rendered its listing (at most 60 s).
        WebDriverWait(driver=driver, timeout=60).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@data-last-letter]")
            )
        )

        newslists = driver.find_elements(
            By.XPATH, "//div[@data-filter-target=\"list\"]"
        )
        if not newslists:
            sys.stderr.write("no news list found at %s\n" % (link))
            return 1
        newslist = newslists[0]

        metas = driver.find_elements(
            By.XPATH, "//meta[@property=\"og:title\"]"
        )
        # Use the document title when the page has no og:title.
        title = metas[0].get_attribute("content") if metas else driver.title
        description = title

        print("""<?xml version="1.0" encoding="utf-8"?>""")
        print("""<feed xmlns="http://www.w3.org/2005/Atom">""")
        print("\t<title>%s</title>" % (_cdata(title)))
        print("\t<subtitle>%s</subtitle>" % (_cdata(description)))
        print("\t<id>%s</id>" % (escape(link)))
        print("\t<link href=%s rel=\"self\" />" % (quoteattr(link)))
        print("\t<link href=%s />" % (quoteattr(link)))

        utcnow = datetime.now(pytz.utc)
        print("\t<updated>%s</updated>" % (utcnow.isoformat()))

        # Entries are emitted in reverse page order.
        articles = newslist.find_elements(By.XPATH, "./div")
        for article in reversed(articles):
            _print_entry(article, link, utcnow)

        print("</feed>")
    finally:
        # Always shut the browser down, even when scraping fails.
        driver.quit()

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))