"""Crawl an index page with headless Firefox and print one TSV row per linked page.

Usage: selenium_crawl_tsv.py <url>

Every ``main a`` link found on <url> is visited in turn.  For each visited
page one line is written to stdout:

    timestamp<TAB>title<TAB>link<TAB>content<TAB>html

where ``timestamp`` comes from the page's first ``<time>`` element,
``title`` is the document title, ``link`` is the page URL and ``content`` is
the outer HTML of the page's first ``<article>`` element.

NOTE(review): the file lives under ``sfeed-atom/``, so the row layout is
presumably the sfeed TSV format -- confirm against the sfeed documentation.
"""

# Provenance: sfeed-atom/selenium_crawl_tsv.py from the brcon2023-hackathons
# repository (Bitreichcon 2023 Hackathon Repository):
#   git clone git://bitreich.org/brcon2023-hackathons
#   git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/brcon2023-hackathons
# commit d2f3f8bf36e6d7b0d88f7d3e02353bcd87a93795
# parent a7cd0c547c792f74b7784cc0a8c806380a28ca2f
# Author: Christoph Lohmann <20h@r-36.net>
# Date:   Thu, 10 Aug 2023 16:14:57 +0200
#     Add selenium to tsv example from bob.

import sys
from datetime import datetime, timezone

# Suffix stripped from every page title before it is written out.
TITLE_SUFFIX = " - Codemadness"


def _control_char_map():
    """Return a translate mapping that deletes C0 control characters and DEL."""
    table = {code: "" for code in range(32)}
    table[0x7F] = ""  # DEL
    return table


def make_escape_content_trans():
    """Build the translation table for the content field.

    Control characters are deleted, except that backslash, newline and TAB
    are rewritten to the two-character sequences ``\\\\``, ``\\n`` and
    ``\\t`` so the content stays on one line and inside one TSV column.
    """
    table = _control_char_map()
    # Ordinal keys only: these entries intentionally replace the "delete"
    # entries for LF and TAB created above, without relying on the order in
    # which str.maketrans() would merge int and str keys for the same char.
    table[ord("\\")] = "\\\\"
    table[ord("\n")] = "\\n"
    table[ord("\t")] = "\\t"
    return str.maketrans(table)


def make_escape_field_trans():
    """Build the translation table for single-line fields (title, link).

    Control characters are deleted, except that newline and TAB become a
    single space.
    """
    table = _control_char_map()
    table[ord("\n")] = " "
    table[ord("\t")] = " "
    return str.maketrans(table)


escape_content_tbl = make_escape_content_trans()
escape_field_tbl = make_escape_field_trans()


def escape_content(s):
    """Return *s* escaped for the content column, with outer whitespace removed.

    Stripping happens after translation, so leading/trailing newlines and
    TABs survive as their escaped ``\\n`` / ``\\t`` forms.
    """
    return s.translate(escape_content_tbl).strip()


def escape_field(s):
    """Return *s* flattened to one line for a plain column, stripped."""
    return s.translate(escape_field_tbl).strip()


def parse_timestamp(text):
    """Convert a ``YYYY-MM-DD`` date string to an integer UNIX timestamp.

    The date is interpreted as midnight UTC so the result does not depend on
    the local timezone of the machine running the crawl.

    Raises:
        ValueError: if *text* is not a ``YYYY-MM-DD`` date.
    """
    day = datetime.strptime(text.strip(), "%Y-%m-%d")
    return int(day.replace(tzinfo=timezone.utc).timestamp())


def format_row(ts, title, link, content):
    """Return one output line (without trailing newline) for a crawled page.

    *title* and *link* are escaped as plain fields, *content* as content; the
    last column is the literal content type ``html``.
    """
    return "%d\t%s\t%s\t%s\thtml" % (
        ts,
        escape_field(title),
        escape_field(link),
        escape_content(content),
    )


def make_driver():
    """Start and return a headless Firefox WebDriver with a stripped-down profile."""
    # Imported here so the escaping/formatting helpers above remain
    # importable on machines that do not have selenium installed.
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options

    options = Options()
    options.add_argument("--headless")

    # use existing profile
    # (selenium.webdriver.firefox.firefox_profile.FirefoxProfile is the
    # related class the original script imported for this variant):
    #options.add_argument("--profile")
    #profile_path = "/home/hiltjo/.mozilla/firefox/z86g7oxr.default-release"
    # NOTE: must not be running at the same time.
    #options.add_argument(profile_path)
    #options.set_preference("profile", profile_path)

    # setup custom profile:
    # JS disabled
    options.set_preference("javascript.enabled", False)
    # disable stylesheet
    options.set_preference("permissions.default.stylesheet", 2)
    # disable image loading
    options.set_preference("permissions.default.image", 2)
    # override user-agent.
    #options.set_preference("general.useragent.override", "whatever you want")

    return webdriver.Firefox(options=options)


def crawl(driver, url):
    """Visit *url*, follow every ``main a`` link and print one row per page.

    Raises whatever the driver raises when a visited page has no ``<time>``
    or ``<article>`` element, and ValueError when the ``<time>`` text is not
    a ``YYYY-MM-DD`` date.
    """
    from selenium.webdriver.common.by import By

    # set timeouts
    #driver.implicitly_wait(10)

    # get the page
    driver.get(url)

    # Debugging aids:
    #print(driver.title)
    #print(driver.page_source)
    #print(driver.find_element(By.TAG_NAME, "html").get_attribute("outerHTML"))

    # Collect the hrefs up front: the anchor elements belong to the index
    # page and are no longer usable once the driver navigates away.
    # (Use By.TAG_NAME, "a" instead to take all links on the page.)
    hrefs = []
    for anchor in driver.find_elements(By.CSS_SELECTOR, "main a"):
        href = anchor.get_attribute("href")
        # get_attribute() yields None for anchors without an href.
        if href:
            hrefs.append(href)

    for href in hrefs:
        driver.get(href)

        # parse timestamp.
        time_el = driver.find_element(By.TAG_NAME, "time")
        ts = parse_timestamp(time_el.text)

        article = driver.find_element(By.CSS_SELECTOR, "article")
        content = article.get_attribute("outerHTML")
        title = driver.title.replace(TITLE_SUFFIX, "")

        print(format_row(ts, title, href, content))


def main(argv):
    """Run the crawler for ``argv[1]``; return the process exit status."""
    if len(argv) < 2:
        # stdout carries the TSV data, so diagnostics go to stderr.
        print("usage: <url>", file=sys.stderr)
        return 1

    driver = make_driver()
    try:
        crawl(driver, argv[1])
    finally:
        # Always shut the browser down, even when a page fails to parse;
        # quit() also closes every open window.
        driver.quit()
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))