I have written a web scraping script that uses Selenium to crawl blog content from a large list of URLs. The script processes the URLs in batches of 1000 and uses a ThreadPoolExecutor to crawl several pages at once. It also registers signal handlers so that progress is saved to disk if the run is interrupted.
Key Features of the Code:
- Headless Chrome Driver: Runs Chrome without a UI (--headless, --no-sandbox, --disable-dev-shm-usage) to cut rendering overhead.
- Blocking Media Files: Uses the CDP command Network.setBlockedURLs to skip images and videos.
- Multithreading: A ThreadPoolExecutor with 8 workers processes multiple URLs concurrently to reduce execution time.
- Progress Saving: Writes intermediate results to a CSV file after each batch and again before exiting.
- Error Handling and Logging: Per-URL errors are caught and recorded in crawler.log for debugging.
Issue:
Despite these optimizations, execution is still slower than expected when processing a large number of URLs. Each URL takes several seconds to fetch: every worker launches a fresh headless Chrome instance, waits for the iframe and the content container, and then sleeps for a random 1-5 seconds before returning, and these per-URL costs add up significantly over thousands of URLs.
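To make "several seconds per URL" concrete, here is a minimal standalone timing sketch (separate from the full script below; TEST_URL is a placeholder, not one of the real blog URLs) that splits the cost of a single fetch into driver startup, page load, and the explicit wait for the iframe:

```python
# Standalone timing sketch: measures how long each phase of a single fetch takes.
# TEST_URL is a placeholder; substitute one of the actual blog URLs.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

TEST_URL = "https://example.com/some-blog-post"

options = webdriver.ChromeOptions()
options.add_argument("--headless")

t0 = time.perf_counter()
driver = webdriver.Chrome(options=options)   # cost of launching headless Chrome
t1 = time.perf_counter()
try:
    driver.get(TEST_URL)                     # cost of loading the page
    t2 = time.perf_counter()
    WebDriverWait(driver, 5).until(          # cost of waiting for the iframe to appear
        EC.presence_of_element_located((By.CSS_SELECTOR, "iframe"))
    )
    t3 = time.perf_counter()
    print(f"startup {t1 - t0:.2f}s, page load {t2 - t1:.2f}s, iframe wait {t3 - t2:.2f}s")
finally:
    driver.quit()
```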
Questions:
- How can I further reduce execution time for this multi-page crawling script?
- Are there any specific optimizations I can apply to improve Selenium's performance, especially when handling iframes and dynamic content?
```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import logging
import random
import signal
import sys

# Logging: per-URL successes and failures go to crawler.log
logging.basicConfig(filename='crawler.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Headless Chrome options shared by every driver instance
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

def start_driver():
    """Launch a headless Chrome instance with media requests blocked via CDP."""
    driver = webdriver.Chrome(options=chrome_options)
    driver.execute_cdp_cmd('Network.enable', {})
    try:
        driver.execute_cdp_cmd('Network.setBlockedURLs', {
            "urls": ["*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp",
                     "*.mp4", "*.avi", "*.mkv", "*.mov"]
        })
    except Exception as e:
        logging.error(f"Error setting blocked URLs: {e}")
    return driver

def crawl_blog_content(url):
    """Fetch one blog post: open the page, switch into its iframe, and read the body text."""
    driver = start_driver()
    try:
        driver.get(url)
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "iframe"))
        )
        iframe = driver.find_element(By.CSS_SELECTOR, "iframe")
        driver.switch_to.frame(iframe)
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.se-main-container"))
        )
        content = driver.find_element(By.CSS_SELECTOR, "div.se-main-container").text
        time.sleep(random.uniform(1, 5))  # polite random delay before returning
        return content
    except Exception as e:
        logging.error(f"Error while fetching content from {url}: {e}")
        return None
    finally:
        driver.quit()

def process_urls(urls):
    """Crawl a batch of URLs concurrently with a pool of 8 worker threads."""
    results = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_url = {executor.submit(crawl_blog_content, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                content = future.result()
                if content:
                    results.append((url, content))
                    logging.info(f"Successfully crawled: {url}")
            except Exception as exc:
                logging.error(f"Error fetching {url}: {exc}")
    return results

# Accumulated (URL, content) pairs and the output file they are written to
global_results = []
output_file = 'contents_202101.csv'

def save_progress():
    """Write everything collected so far to the output CSV."""
    if global_results:
        temp_df = pd.DataFrame(global_results, columns=['URL', 'Content'])
        temp_df.to_csv(output_file, index=False)
        logging.info(f"Progress saved with {len(global_results)} entries.")

def signal_handler(sig, frame):
    """Save progress and exit cleanly on SIGINT/SIGTERM."""
    logging.info("Termination signal received. Saving progress...")
    save_progress()
    sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

if __name__ == "__main__":
    input_file = 'url_202101.csv'
    urls_df = pd.read_csv(input_file)
    urls = urls_df['URL'].tolist()

    # Split the URL list into batches of 1000
    batch_size = 1000
    url_chunks = [urls[i:i + batch_size] for i in range(0, len(urls), batch_size)]

    for idx, chunk in enumerate(url_chunks):
        logging.info(f"Processing batch {idx + 1}/{len(url_chunks)}")
        results = process_urls(chunk)
        global_results.extend(results)
        save_progress()
        logging.info(f"Batch {idx + 1} saved with {len(global_results)} entries.")

    save_progress()
    logging.info(f"Final results saved to {output_file} with {len(global_results)} entries.")
```
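For context on the files involved: url_202101.csv is expected to contain a URL column, the collected results are written to contents_202101.csv with URL and Content columns (the file is rewritten after every batch), and per-URL successes and failures are logged to crawler.log.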