URI: 
       selenium_crawl_tsv.py - brcon2023-hackathons - Bitreichcon 2023 Hackathon Repository
  HTML git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/brcon2023-hackathons
   DIR Log
   DIR Files
   DIR Refs
   DIR Tags
       ---
       selenium_crawl_tsv.py (3006B)
       ---
            1 from selenium import webdriver
            2 from selenium.webdriver.common.by import By
            3 
            4 from selenium.webdriver.firefox.options import Options
            5 from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
            6 
            7 import sys
            8 from datetime import datetime
            9 
           10 def make_escape_content_trans():
           11     m = {}
           12     for i in range(0, 32):
           13         m[i] = ""
           14     m[0x7f] = "" # DEL
           15     # replace
           16     m["\\"] = "\\\\"
           17     m["\n"] = "\\n"
           18     m["\t"] = "\\t"
           19 
           20     return str.maketrans(m)
           21 
           22 def make_escape_field_trans():
           23     m = {}
           24     for i in range(0, 32):
           25         m[i] = ""
           26     m[0x7f] = "" # DEL
           27     # replace
           28     m["\n"] = " "
           29     m["\t"] = " "
           30 
           31     return str.maketrans(m)
           32 
           33 escape_content_tbl = make_escape_content_trans()
           34 escape_field_tbl = make_escape_field_trans()
           35 
           36 def escape_content(s):
           37     return s.translate(escape_content_tbl).strip()
           38 
           39 def escape_field(s):
           40     return s.translate(escape_field_tbl).strip()
           41 
           42 if len(sys.argv) > 1:
           43     url = sys.argv[1]
           44 else:
           45     print("usage: <url>")
           46     sys.exit(1)
           47 
           48 options = Options()
           49 options.add_argument("--headless")
           50 
           51 # use existing profile:
           52 
           53 #options.add_argument("--profile")
           54 #profile_path = "/home/hiltjo/.mozilla/firefox/z86g7oxr.default-release"
           55 # NOTE: must not be running at the same time.
           56 #options.add_argument(profile_path)
           57 #options.set_preference("profile", profile_path)
           58 
           59 # setup custom profile:
           60 # JS disabled
           61 options.set_preference("javascript.enabled", False)
           62 # disable stylesheet
           63 options.set_preference("permissions.default.stylesheet", 2)
           64 # disable image loading
           65 options.set_preference("permissions.default.image", 2)
           66 # override user-agent.
           67 #options.set_preference("general.useragent.override", "whatever you want")
           68 
           69 driver = webdriver.Firefox(options=options)
           70 
           71 # set timeouts
           72 #driver.implicitly_wait(10)
           73 
           74 # get the page
           75 driver.get(url)
           76 
           77 # print page title
           78 #print(driver.title)
           79 
           80 #pagesource = driver.execute_script("return document.body.InnerHTML;")
           81 #print(pagesource)
           82 #print(driver.page_source)
           83 #outer_html = driver.find_element(By.XPATH, "//body").get_attribute("outerHTML")
           84 
           85 #outer_html = driver.find_element(By.TAG_NAME, "html").get_attribute("outerHTML")
           86 #print(outer_html)
           87 
           88 # show all links on a page
           89 #links = driver.find_elements(By.TAG_NAME, "a")
           90 anchors = driver.find_elements(By.CSS_SELECTOR, "main a")
           91 links = []
           92 for anchor in anchors:
           93     href = anchor.get_attribute("href")
           94     text = anchor.text
           95     if len(href):
           96        links.append({"href": href, "text": text})
           97 
           98 for link in links:
           99     driver.get(link["href"])
          100 
          101     # parse timestamp.
          102     time = driver.find_element(By.TAG_NAME, "time")
          103     ts = datetime.strptime(time.text, "%Y-%m-%d")
          104     ts = int(ts.timestamp())
          105 
          106     content = driver.find_element(By.CSS_SELECTOR, "article").get_attribute("outerHTML")
          107     title = driver.title
          108     title = title.replace(" - Codemadness", "")
          109 
          110     # escape fields
          111     content = escape_content(content)
          112     title = escape_field(title)
          113     link = escape_field(link["href"])
          114 
          115     print("%d\t%s\t%s\t%s\thtml" % (ts, title, link, content))
          116 
          117 driver.close()
          118 driver.quit()