This is a small web-scraping project I built in about 2 hours that targets the website remote.co. I am looking for suggestions to improve my code. I know about the inconsistency between the WebDriverWait and time.sleep() waits, but when I used WebDriverWait to wait until the load_more button was clickable, Selenium crashed my WebDriver window and continuously spammed my terminal with 20-30 lines of seemingly useless text.
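For what it's worth, here is a minimal sketch of how that clickable-wait loop might look, with ChromeDriver's console output silenced. This assumes Selenium 3.x and Chrome on Windows; the excludeSwitches option is a commonly used workaround for the "DevTools listening ..." log lines, and the timeout values are illustrative, not taken from the original code.

    # Sketch: explicit wait instead of a fixed sleep, with log spam suppressed.
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    options = webdriver.ChromeOptions()
    # Suppresses ChromeDriver's noisy logging on Windows
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(r'C:\Users\leagu\chromedriver.exe', options=options)

    wait = WebDriverWait(driver, 10)
    while True:
        try:
            load_more = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.load_more_jobs'))
            )
            load_more.click()
        except TimeoutException:
            # No clickable button within 10s: assume the bottom of the page
            break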
import scrapy
from selenium import webdriver
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
    ElementClickInterceptedException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep


class ScrapeRemote(scrapy.Spider):
    name = 'jobs'
    # job_title must be assigned before start_urls uses it in the f-string
    job_title = input('Enter your desired position: ').replace(' ', '+')
    start_urls = [f'https://remote.co/remote-jobs/search/?search_keywords={job_title}']

    def __init__(self):
        super().__init__()
        self.driver = webdriver.Chrome(r'C:\Users\leagu\chromedriver.exe')

    def parse(self, response):
        self.driver.get(response.url)

        try:
            load_more = WebDriverWait(self.driver, 10).until(
                EC.visibility_of_element_located(
                    (By.XPATH, '/html/body/main/div[2]/div/div[1]/div[3]/div/div/a')
                )
            )
        except TimeoutException:
            self.log("Timeout - Couldn't load the page!")

        # Keep clicking "load more jobs" until no more listings can be loaded
        while True:
            try:
                sleep(1.5)
                load_more = self.driver.find_element_by_css_selector('a.load_more_jobs')
                load_more.click()
            # NoSuchElementException is caught too, so the loop ends cleanly
            # if the button is removed from the page entirely
            except (ElementNotInteractableException,
                    ElementClickInterceptedException,
                    NoSuchElementException):
                # A pop-up may be covering the button; try to dismiss it
                try:
                    close_button = WebDriverWait(self.driver, 6).until(
                        EC.element_to_be_clickable(
                            (By.CSS_SELECTOR, '#om-oqulaezshgjig4mgnmcn-optin > div > button')
                        )
                    )
                    close_button.click()
                except TimeoutException:
                    self.log('Reached Bottom Of The Page!')
                    break

        # Hand the fully loaded page back to Scrapy's selectors
        selector = scrapy.selector.Selector(text=self.driver.page_source)
        listings = selector.css('li.job_listing').getall()

        for listing in listings:
            selector = scrapy.selector.Selector(text=listing)

            position = selector.css('div.position h3::text').get()
            company = selector.css('div.company strong::text').get()
            more_information = selector.css('a::attr(href)').get()

            yield {
                'position': position,
                'company': company,
                'more_information': more_information
            }

        self.driver.close()
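For completeness, a self-contained spider like this can be run without a full Scrapy project using runspider; the filename and output path below are placeholders:

    scrapy runspider jobs_spider.py -o jobs.json

Using runspider keeps the script as a single file, while -o exports the yielded items to JSON for inspection.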