I use asyncio to speed up web scraping. I collect only the title, author, tags, datetime, and total comment count from the list view of a specific website, across all of its pages. I would like to improve my code, so I would appreciate any ideas.
My code:
```python
from bs4 import BeautifulSoup, Tag
from dataclasses import dataclass
from typing import List
import aiohttp
import asyncio
from functools import reduce
from operator import iconcat

BASE_URL = '...'  # list-view URL of the target site (omitted here)


@dataclass
class Article:
    title: str
    author: str
    tags: str
    upload_on: str
    comments: int
    link: str

    @classmethod
    def from_page_items(cls, item: Tag) -> 'Article':
        spans = item.find('div', {'class': 'entry__header'}).find_all('span')
        entry_title = item.find('h2', {'class': 'entry__title'})
        anchor = item.find('div', {'class': 'entry__header'}).find_all('a')
        return cls(
            title=entry_title.text.strip(),
            author=anchor[1].text,
            tags=anchor[2].text,
            upload_on=spans[0].text,
            comments=int(spans[1].text) if len(spans) > 1 else 0,
            link=entry_title.find('a').get('href')
        )


class Scrape:
    def __init__(self, url) -> None:
        self.session = None
        self.url = url

    async def __aenter__(self):
        self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, *args):
        await self.session.close()

    async def fetch_url(self, params: dict = None) -> BeautifulSoup:
        """Fetch the url and return the parsed HTML document

        Args:
            params (dict, optional): query-string parameters. Defaults to None.

        Returns:
            BeautifulSoup: parsed HTML document
        """
        async with self.session.get(self.url, params=params) as response:
            response.raise_for_status()
            resp_text = await response.text()
            soup = BeautifulSoup(resp_text, 'html.parser')
            return soup

    async def get_page_articles(self, page: int) -> List[Article]:
        """For one page, return all of its articles as Article instances

        Args:
            page (int): the page number

        Returns:
            List[Article]: list of Article
        """
        doc = await self.fetch_url(params={'p': page})
        articles = [Article.from_page_items(article)
                    for article in doc.find_all('article', {'class': 'entry card post-list'})]
        await asyncio.sleep(1)
        return articles

    async def gather_articles(self) -> List[List[Article]]:
        """Gather all pages up to the end of the pagination

        Returns:
            List[List[Article]]
        """
        doc = await self.fetch_url()
        end_page_number = int(doc.select_one(
            'ul.pagination li:last-child').find('a')['href'].split('=')[-1])
        coros = [self.get_page_articles(page)
                 for page in range(1, end_page_number + 1)]
        return await asyncio.gather(*coros)

    async def get_all_articles(self) -> List[Article]:
        """Gather all pages and flatten them into a single List[Article]

        Returns:
            List[Article]
        """
        result = await self.gather_articles()
        return reduce(iconcat, result, [])


async def main():
    async with Scrape(BASE_URL) as scrape:
        result = await scrape.get_all_articles()
        print(result)


asyncio.run(main())
```
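One change I am considering is capping the number of in-flight requests with a semaphore instead of sleeping one second inside every page coroutine. A minimal sketch of that idea, reusing the `Scrape` class above (the limit of 5 is an arbitrary number I picked, not a recommendation):

```python
import asyncio

async def fetch_with_limit(sem: asyncio.Semaphore, scrape: 'Scrape', page: int):
    # Only a handful of requests run concurrently; the rest wait here
    # instead of each coroutine doing a fixed asyncio.sleep(1).
    async with sem:
        return await scrape.get_page_articles(page)

async def gather_bounded(scrape: 'Scrape', end_page_number: int):
    sem = asyncio.Semaphore(5)  # arbitrary cap on concurrent requests
    coros = [fetch_with_limit(sem, scrape, page)
             for page in range(1, end_page_number + 1)]
    return await asyncio.gather(*coros)
```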
After that, I will store all of this info in a database and explore it with the pandas library.
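For reference, a minimal sketch of that next step, assuming a local SQLite file; the file name `articles.db` and the table name `articles` are placeholders I made up:

```python
import sqlite3
from dataclasses import asdict
import pandas as pd

def store_articles(articles):
    # articles: the List[Article] returned by get_all_articles()
    df = pd.DataFrame([asdict(a) for a in articles])
    with sqlite3.connect('articles.db') as conn:
        df.to_sql('articles', conn, if_exists='replace', index=False)
    return df  # ready for further analysis with pandas
```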