"""selenium_crawl_tsv.py - crawl the pages linked from an index page to TSV.

Part of the brcon2023-hackathons repository
(git clone git://bitreich.org/brcon2023-hackathons).

Usage: selenium_crawl_tsv.py <url>

The index page at <url> is loaded in a headless Firefox with JavaScript,
stylesheets and images disabled.  Every link found inside its <main>
element is visited, and for each visited page one line is written to
stdout with these TAB-separated fields:

    UNIX timestamp, title, link, escaped <article> HTML, the literal "html"
"""

import sys
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.options import Options


def make_escape_content_trans():
    """Return the str.translate() table used for the content field.

    Backslash, newline and TAB are replaced by a two-character backslash
    escape sequence; all other ASCII control characters (0x00-0x1f and
    DEL) are removed.
    """
    m = {i: "" for i in range(0, 32)}
    m[0x7f] = ""  # DEL
    # Ordinal keys are used throughout so these three entries explicitly
    # override the "remove control character" entries set above.
    m[ord("\\")] = "\\\\"
    m[ord("\n")] = "\\n"
    m[ord("\t")] = "\\t"

    return str.maketrans(m)


def make_escape_field_trans():
    """Return the str.translate() table used for single-line fields.

    Newline and TAB are replaced by a space; all other ASCII control
    characters (0x00-0x1f and DEL) are removed.
    """
    m = {i: "" for i in range(0, 32)}
    m[0x7f] = ""  # DEL
    # Ordinal keys: explicitly override the removal entries set above.
    m[ord("\n")] = " "
    m[ord("\t")] = " "

    return str.maketrans(m)


escape_content_tbl = make_escape_content_trans()
escape_field_tbl = make_escape_field_trans()


def escape_content(s):
    """Escape `s` for the content column and strip surrounding whitespace."""
    return s.translate(escape_content_tbl).strip()


def escape_field(s):
    """Flatten `s` to a single TAB-free line and strip surrounding whitespace."""
    return s.translate(escape_field_tbl).strip()


def make_options():
    """Return the Firefox options used for crawling."""
    options = Options()
    options.add_argument("--headless")

    # use existing profile:
    #
    #options.add_argument("--profile")
    #profile_path = "/home/hiltjo/.mozilla/firefox/z86g7oxr.default-release"
    # NOTE: must not be running at the same time.
    #options.add_argument(profile_path)
    #options.set_preference("profile", profile_path)

    # setup custom profile:
    # JS disabled
    options.set_preference("javascript.enabled", False)
    # disable stylesheet
    options.set_preference("permissions.default.stylesheet", 2)
    # disable image loading
    options.set_preference("permissions.default.image", 2)
    # override user-agent.
    #options.set_preference("general.useragent.override", "whatever you want")

    return options


def collect_links(driver):
    """Return the href of every anchor inside <main> on the current page.

    The hrefs are copied into a plain list before any navigation happens,
    so the caller does not hold on to elements of a page it has left.
    Use By.TAG_NAME "a" instead of the CSS selector to get all links on
    the page.
    """
    hrefs = []
    for anchor in driver.find_elements(By.CSS_SELECTOR, "main a"):
        href = anchor.get_attribute("href")
        # get_attribute() returns None for an anchor without href; the
        # truthiness test skips both None and the empty string.
        if href:
            hrefs.append(href)
    return hrefs


def print_page(driver, href):
    """Load `href` and print its TSV line to stdout.

    Raises whatever selenium raises when the page has no <time> or
    <article> element, and ValueError when the <time> text is not in
    YYYY-MM-DD form.
    """
    driver.get(href)

    # parse timestamp.
    # strptime() yields a naive datetime, which timestamp() interprets in
    # the local timezone of the machine running the crawl.
    time_elem = driver.find_element(By.TAG_NAME, "time")
    ts = int(datetime.strptime(time_elem.text, "%Y-%m-%d").timestamp())

    article = driver.find_element(By.CSS_SELECTOR, "article")
    content = article.get_attribute("outerHTML")
    # remove the site name from the page title.
    title = driver.title.replace(" - Codemadness", "")

    # escape fields
    print("%d\t%s\t%s\t%s\thtml" % (
        ts,
        escape_field(title),
        escape_field(href),
        escape_content(content),
    ))


def main(argv):
    """Crawl the index URL given in argv[1]; return the process exit status."""
    if len(argv) < 2:
        # stdout carries the TSV data, so the usage text goes to stderr.
        print("usage: <url>", file=sys.stderr)
        return 1
    url = argv[1]

    driver = webdriver.Firefox(options=make_options())
    try:
        # set timeouts
        #driver.implicitly_wait(10)

        # get the page
        driver.get(url)

        for href in collect_links(driver):
            print_page(driver, href)
    finally:
        # Always shut the browser down, also when a page fails to parse.
        # quit() closes every window, so no separate close() is needed.
        driver.quit()

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))