I've been working on speeding up my web scraping with the asyncio
library. I have a working solution, but I'm unsure how Pythonic it is or whether I'm using the library properly. Any input would be appreciated.
import asyncio

import aiohttp
import requests
from lxml import etree


@asyncio.coroutine
def get(*args, **kwargs):
    """
    A wrapper for aiohttp's get method. Taken from Georges Dubus' article at
    http://compiletoi.net/fast-scraping-in-python-with-asyncio.html
    """
    response = yield from aiohttp.request('GET', *args, **kwargs)
    return (yield from response.read_and_close())


# Shared semaphore so that at most five requests are in flight at once.
# Creating a new Semaphore inside each coroutine would give every call its
# own counter, so nothing would actually be limited.
sem = asyncio.Semaphore(5)


@asyncio.coroutine
def extract_text(url):
    """
    Given the url for a chapter, extract the relevant text from it.

    :param url: the url for the chapter to scrape
    :return: a string containing the chapter's text
    """
    with (yield from sem):
        page = yield from get(url)
    tree = etree.HTML(page)
    paragraphs = tree.findall('.//*/div[@class="entry-content"]/p')[1:-1]
    return b'\n'.join(etree.tostring(paragraph) for paragraph in paragraphs)


def generate_links():
    """
    Generate the links to each of the chapters.

    :return: a list of strings containing every url to visit
    """
    start_url = 'https://twigserial.wordpress.com/'
    base_url = 'https://twigserial.wordpress.com/category/story/'
    tree = etree.HTML(requests.get(start_url).text)
    xpath = './/*/option[@class="level-2"]/text()'
    return [base_url + suffix.strip() for suffix in tree.xpath(xpath)]


@asyncio.coroutine
def run():
    links = generate_links()
    chapters = []
    for f in asyncio.as_completed([extract_text(link) for link in links]):
        result = yield from f
        chapters.append(result)
    return chapters


def main():
    loop = asyncio.get_event_loop()
    chapters = loop.run_until_complete(run())
    print(len(chapters))


if __name__ == '__main__':
    main()
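One thing I'm not sure about: asyncio.as_completed yields results in whatever order they finish, so my chapters list ends up in completion order rather than story order. If I understand the docs correctly, asyncio.gather preserves the order of the coroutines passed to it, so a variant of run() could look something like the sketch below (the name run_ordered is just for illustration):

@asyncio.coroutine
def run_ordered():
    """Variant of run() that keeps chapters in the same order as the links."""
    links = generate_links()
    # gather() returns results in the order the coroutines were passed in,
    # unlike as_completed(), which yields them as they finish.
    chapters = yield from asyncio.gather(*[extract_text(link) for link in links])
    return chapters

I stuck with as_completed above, but I'd welcome opinions on which of the two is more idiomatic here.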