I am 12 days into Python and web scraping and managed to write my first-ever automation script. Please review my code and point out any blunders.
What do I want to achieve?
I want to scrape all chapters of each novel in each category and post them on a WordPress blog as a test. Please also point out anything I missed that is mandatory for running this script against a WordPress blog.
    from requests import get
    from bs4 import BeautifulSoup
    import re

    site = "https://example.com"  # placeholder: base URL of the novel site
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"}

    r = get(site, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    category = soup.findAll(class_="search-by-genre")  # Getting all categories (currently unused; the loop below collects the links)

    categories = []
    for link in soup.findAll(href=re.compile(r'/category/\w+$')):
        print("Category:", link.text)
        category_link = link['href']
        categories.append(category_link)

    # Getting all Novel Headers
    for category in categories:
        r = get(category, headers=headers)
        soup = BeautifulSoup(r.text, "lxml")
        Novels_header = soup.findAll(class_="top-novel-header")

        # Getting Novels' Title and Link
        for Novel_names in Novels_header:
            print("Novel:", Novel_names.text.strip())
            Novel_link = Novel_names.find('a')['href']

            # Getting Novel's Info
            r = get(Novel_link, headers=headers)
            soup = BeautifulSoup(r.text, "lxml")
            Novel_divs = soup.findAll(class_="chapter-chs")

            # Novel Chapters
            for articles in Novel_divs:
                article_ch = articles.findAll("a")
                for chapters in article_ch:
                    ch = chapters["href"]

                    # Getting article
                    r = get(ch, headers=headers)
                    soup = BeautifulSoup(r.content, "lxml")
                    title = soup.find(class_="block-title")
                    print(title.text.strip())
                    full_article = soup.find("div", {"class": "desc"})

                    # remove ads inside the text:
                    for ads in full_article.select('center, small, a'):
                        ads.extract()
                    print(full_article.get_text(strip=True, separator='\n'))
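For the WordPress part I have not written anything yet. Below is a rough sketch of what I was planning, using the WordPress REST API (`/wp-json/wp/v2/posts`) with an application password for authentication; the blog URL, username, password, and the `publish_chapter` helper name are all placeholders I made up, and I have not tested this against a real blog:

    import requests

    WP_API_URL = "https://myblog.example.com/wp-json/wp/v2/posts"  # placeholder: my blog's posts endpoint
    WP_USER = "my-username"                                        # placeholder username
    WP_APP_PASSWORD = "xxxx xxxx xxxx xxxx"                        # placeholder application password

    def publish_chapter(title, content):
        """Create a post on the blog via the WordPress REST API and return its ID."""
        payload = {
            "title": title,
            "content": content,
            "status": "draft",  # keep chapters as drafts while testing
        }
        resp = requests.post(
            WP_API_URL,
            json=payload,
            auth=(WP_USER, WP_APP_PASSWORD),  # HTTP Basic auth with an application password
            timeout=30,
        )
        resp.raise_for_status()
        return resp.json()["id"]

    # Intended use inside the chapter loop:
    # publish_chapter(title.text.strip(), full_article.get_text(strip=True, separator='\n'))

As far as I understand, application passwords are built into WordPress 5.6 and later (under Users → Profile), so no extra plugin should be needed, but please correct me if this is not the right way to authenticate or if the blog needs any other configuration to accept these requests.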