Create a Simple Web Crawler Using aiohttp in Python

In this tutorial, you’ll learn how to create a web crawler using aiohttp in Python.

The asynchronous approach lets the crawler fetch many pages concurrently instead of waiting for each request to finish before starting the next one.
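
Unlike a sequential crawler that waits for each page before requesting the next, asyncio lets you start several requests and wait for them together. As a minimal sketch (the example.com/.org/.net URLs are placeholders), you could fetch a handful of pages concurrently with asyncio.gather:

import asyncio
import aiohttp

async def fetch(session, url):
    # Reuse one session for every request and return the response body as text
    async with session.get(url) as response:
        return await response.text()

async def main():
    # Placeholder URLs -- swap in the pages you actually want to crawl
    urls = [
        "https://example.com",
        "https://example.org",
        "https://example.net",
    ]
    async with aiohttp.ClientSession() as session:
        # asyncio.gather runs all the fetches concurrently instead of one after another
        pages = await asyncio.gather(*(fetch(session, url) for url in urls))
    for url, html in zip(urls, pages):
        print(f"{url}: {len(html)} characters")

asyncio.run(main())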

Design Crawler Scope

Before diving into the code, you must define your crawler objectives and scope:

class WebCrawler:
    def __init__(self, start_url, max_depth=3, max_urls=100):
        self.start_url = start_url
        self.max_depth = max_depth
        self.max_urls = max_urls
        self.visited_urls = set()
        self.to_visit = [(start_url, 0)]  # (url, depth)
crawler = WebCrawler("https://en.wikipedia.org/wiki/Python_(programming_language)", max_depth=2, max_urls=50)
print(f"Crawler initialized with start URL: {crawler.start_url}")
print(f"Max depth: {crawler.max_depth}")
print(f"Max URLs to crawl: {crawler.max_urls}")

Output:

Crawler initialized with start URL: https://en.wikipedia.org/wiki/Python_(programming_language)
Max depth: 2
Max URLs to crawl: 50

This code sets up the basic structure for your web crawler, defining its starting point, the maximum crawl depth, and the maximum number of URLs to crawl.

The to_visit attribute holds the list of URLs the crawler still needs to visit, each paired with the depth at which it was discovered.

The visited_urls set records pages that have already been crawled so the same page is never fetched twice.
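
As a rough sketch of how these two attributes work together, here is a hypothetical helper (next_url is not part of the class above, it is introduced only for illustration) that pops the next unvisited URL while respecting the limits:

def next_url(crawler):
    # Pop URLs until we find one that hasn't been visited, or the frontier is exhausted
    while crawler.to_visit and len(crawler.visited_urls) < crawler.max_urls:
        url, depth = crawler.to_visit.pop(0)
        if url not in crawler.visited_urls and depth <= crawler.max_depth:
            crawler.visited_urls.add(url)
            return url, depth
    return None

print(next_url(crawler))  # ('https://en.wikipedia.org/wiki/Python_(programming_language)', 0)
print(next_url(crawler))  # None -- nothing else is queued until new links are added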

Parse HTML content

To parse HTML content, you’ll use the BeautifulSoup library. First, install it, along with aiohttp if you haven’t already, using pip:

pip install aiohttp beautifulsoup4

Now, you can use BeautifulSoup to parse HTML content:

import aiohttp
import asyncio
from bs4 import BeautifulSoup
async def fetch_and_parse(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html = await response.text()
            soup = BeautifulSoup(html, 'html.parser')
            return soup
async def main():
    url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    soup = await fetch_and_parse(url)
    print(soup.title.string)
asyncio.run(main())

Output:

Python (programming language) - Wikipedia

This code fetches the HTML content of a webpage asynchronously and parses it using BeautifulSoup.
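
In practice, some URLs return errors or non-HTML responses. A hedged variant of the function above (fetch_and_parse_safe is a name introduced here, not part of the tutorial's later code) can check the status code and Content-Type header before parsing:

async def fetch_and_parse_safe(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # Skip error responses and anything that isn't HTML
            if response.status != 200:
                return None
            if 'text/html' not in response.headers.get('Content-Type', ''):
                return None
            html = await response.text()
            return BeautifulSoup(html, 'html.parser')

async def main():
    soup = await fetch_and_parse_safe("https://en.wikipedia.org/wiki/Python_(programming_language)")
    print(soup.title.string if soup else "Page skipped")

asyncio.run(main())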

Extract links and relevant data

You can extract links and other relevant data from the parsed HTML:

async def extract_links(soup):
    links = []
    for a_tag in soup.find_all('a', href=True):
        links.append(a_tag['href'])
    return links
async def main():
    url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    soup = await fetch_and_parse(url)
    links = await extract_links(soup)
    print(f"Found {len(links)} links:")
    for link in links[:5]:  # Print first 5 links
        print(link)
asyncio.run(main())

Output:

Found 2424 links:
#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random

This code extracts all the links from the parsed HTML and prints the first 5.
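
Notice that many of these links are relative (for example /wiki/Main_Page). A small sketch using urllib.parse.urljoin (extract_absolute_links is an illustrative name, not used elsewhere in this tutorial) resolves them into absolute URLs before they go into the crawl frontier:

from urllib.parse import urljoin

async def extract_absolute_links(soup, base_url):
    links = []
    for a_tag in soup.find_all('a', href=True):
        # Resolve relative hrefs like /wiki/Main_Page against the page's own URL
        absolute = urljoin(base_url, a_tag['href'])
        if absolute.startswith(('http://', 'https://')):  # drop mailto:, javascript:, etc.
            links.append(absolute)
    return links

You can call it exactly like extract_links, passing the page URL as the second argument.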

Respect robots.txt

To parse robots.txt files, you can use the robotparser module:

from urllib import robotparser
import asyncio
async def get_robots_parser(url):
    rp = robotparser.RobotFileParser()
    rp.set_url(f"{url}/robots.txt")
    await asyncio.to_thread(rp.read)
    return rp
async def main():
    url = "https://en.wikipedia.org"
    rp = await get_robots_parser(url)
    print(f"Can fetch /trap/: {rp.can_fetch('*', f'{url}/trap/')}")
    print(f"Can fetch /wiki/: {rp.can_fetch('*', f'{url}/wiki/')}")
asyncio.run(main())

Output:

Can fetch /trap/: False
Can fetch /wiki/: True

This code parses the robots.txt file and checks if certain paths are allowed to be crawled.
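
robots.txt can also declare a Crawl-delay. RobotFileParser exposes it through crawl_delay(), which returns None when the site doesn't set one, so a small sketch (the polite_sleep name and one-second fallback are assumptions made here) might look like this:

async def polite_sleep(rp, default_delay=1.0):
    # Honor the site's Crawl-delay if robots.txt declares one, otherwise wait a default amount
    delay = rp.crawl_delay('*') or default_delay
    await asyncio.sleep(delay)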

Honor disallowed URLs

You can integrate the robots.txt parsing into your crawler:

async def fetch(session, url):
    # Helper to download the raw HTML for a URL using the shared session
    async with session.get(url) as response:
        return await response.text()

async def crawl(url):
    async with aiohttp.ClientSession() as session:
        rp = await get_robots_parser(url)
        paths = ['/trap/', '/wiki/']
        for path in paths:
            full_url = f"{url}{path}"
            if rp.can_fetch('*', full_url):
                print(f"Fetching allowed: {full_url}")
                content = await fetch(session, full_url)
                print(f"Content from {full_url[:50]}: {content[:100]}...")  # Print first 100 chars
            else:
                print(f"Fetching not allowed: {full_url}")

async def main():
    url = "https://en.wikipedia.org"
    await crawl(url)

asyncio.run(main())

Output:

Fetching not allowed: https://en.wikipedia.org/trap/
Fetching allowed: https://en.wikipedia.org/wiki/
Content from https://en.wikipedia.org/wiki/: <!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-la...

This code checks if a URL is allowed before crawling it.
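
Real crawls also run into timeouts and connection errors. As a sketch (fetch_safely and the 10-second limit are choices made for illustration, not part of the code above), you could wrap the request in aiohttp's ClientTimeout and catch ClientError:

async def fetch_safely(session, url, timeout_seconds=10):
    # Return the page text, or None if the request fails or times out
    try:
        timeout = aiohttp.ClientTimeout(total=timeout_seconds)
        async with session.get(url, timeout=timeout) as response:
            return await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError):
        print(f"Failed to fetch {url}")
        return None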

Data Storage and Persistence

For this example, you can use SQLite as it’s built into Python and doesn’t require additional setup:

import sqlite3
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from urllib import robotparser

# Create the database and the pages table if they don't exist yet
def init_db():
    conn = sqlite3.connect('crawled_pages.db')
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS pages (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE,
            content TEXT
        )
    ''')
    conn.commit()
    conn.close()

# Store the page content in the database
def store_page(url, content):
    conn = sqlite3.connect('crawled_pages.db')
    cursor = conn.cursor()
    cursor.execute('''
        INSERT OR IGNORE INTO pages (url, content) VALUES (?, ?)
    ''', (url, content))
    conn.commit()
    conn.close()

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def fetch_and_parse(url):
    async with aiohttp.ClientSession() as session:
        html_content = await fetch(session, url)
        return BeautifulSoup(html_content, 'html.parser')

async def get_robots_parser(url):
    rp = robotparser.RobotFileParser()
    rp.set_url(f"{url}/robots.txt")
    await asyncio.to_thread(rp.read)
    return rp

async def extract_links(soup):
    links = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith('/wiki/'):  # Only consider Wikipedia article links
            links.append(f"https://en.wikipedia.org{href}")
    return links

async def crawl(start_url, max_pages=10):
    rp = await get_robots_parser("https://en.wikipedia.org")
    to_visit = [start_url]
    visited = set()
    while to_visit and len(visited) < max_pages:
        url = to_visit.pop(0)
        if url in visited:
            continue
        if rp.can_fetch('*', url):
            print(f"Fetching allowed: {url}")
            soup = await fetch_and_parse(url)
            content = str(soup)
            store_page(url, content)  # Store the page in the database
            visited.add(url)
            links = await extract_links(soup)
            print(f"Found {len(links)} links on {url}")
            for link in links:
                if link not in visited and link not in to_visit:
                    to_visit.append(link)
        else:
            print(f"Fetching not allowed: {url}")

async def main():
    init_db()
    start_url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    await crawl(start_url)

asyncio.run(main())

After running this code, you’ll have a SQLite database (crawled_pages.db) containing the HTML of the first 10 pages that were crawled.
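
To check what was stored, you can query the database directly (the exact pages will vary from run to run):

import sqlite3

conn = sqlite3.connect('crawled_pages.db')
cursor = conn.cursor()
cursor.execute('SELECT url, length(content) FROM pages')
for url, size in cursor.fetchall():
    print(f"{url} ({size} characters)")
conn.close()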

Depth and Breadth Control

You can implement both depth-first and breadth-first crawling by modifying how you manage the to_visit list:

import asyncio
from collections import deque

class WebCrawler:
    def __init__(self, start_url, max_depth=3, strategy='bfs'):
        self.start_url = start_url
        self.max_depth = max_depth
        self.strategy = strategy
        self.visited_urls = set()
        self.to_visit = deque([(start_url, 0)])  # (url, depth)

    async def crawl(self):
        while self.to_visit:
            url, depth = self.to_visit.popleft() if self.strategy == 'bfs' else self.to_visit.pop()
            if depth > self.max_depth:
                continue
            if url not in self.visited_urls:
                print(f"Crawling: {url} (depth: {depth})")
                self.visited_urls.add(url)
                # Simulate finding new URLs
                new_urls = [f"{url}/page{i}" for i in range(1, 4)]
                for new_url in new_urls:
                    self.to_visit.append((new_url, depth + 1))
async def main():
    bfs_crawler = WebCrawler("https://example.com", max_depth=2, strategy='bfs')
    await bfs_crawler.crawl()
    print("\nNow with DFS:")
    dfs_crawler = WebCrawler("https://example.com", max_depth=2, strategy='dfs')
    await dfs_crawler.crawl()
asyncio.run(main())

Output:

Crawling: https://example.com (depth: 0)
Crawling: https://example.com/page1 (depth: 1)
Crawling: https://example.com/page2 (depth: 1)
Crawling: https://example.com/page3 (depth: 1)
Crawling: https://example.com/page1/page1 (depth: 2)
Crawling: https://example.com/page1/page2 (depth: 2)
Crawling: https://example.com/page1/page3 (depth: 2)
Crawling: https://example.com/page2/page1 (depth: 2)
Crawling: https://example.com/page2/page2 (depth: 2)
Crawling: https://example.com/page2/page3 (depth: 2)
Crawling: https://example.com/page3/page1 (depth: 2)
Crawling: https://example.com/page3/page2 (depth: 2)
Crawling: https://example.com/page3/page3 (depth: 2)

Now with DFS:
Crawling: https://example.com (depth: 0)
Crawling: https://example.com/page3 (depth: 1)
Crawling: https://example.com/page3/page3 (depth: 2)
Crawling: https://example.com/page3/page2 (depth: 2)
Crawling: https://example.com/page3/page1 (depth: 2)
Crawling: https://example.com/page2 (depth: 1)
Crawling: https://example.com/page2/page3 (depth: 2)
Crawling: https://example.com/page2/page2 (depth: 2)
Crawling: https://example.com/page2/page1 (depth: 2)
Crawling: https://example.com/page1 (depth: 1)
Crawling: https://example.com/page1/page3 (depth: 2)
Crawling: https://example.com/page1/page2 (depth: 2)
Crawling: https://example.com/page1/page1 (depth: 2)

This code demonstrates both strategies: with 'bfs' the crawler takes URLs from the left end of the deque (a queue), while with 'dfs' it takes them from the right end (a stack), which is why the DFS run explores page3's children before returning to page1.

User-agent Rotation

You can set a random User-Agent header for each request to make your crawler harder to identify and block as a bot:

import random
import asyncio
import aiohttp
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
]
async def fetch_with_user_agent(url):
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as response:
            print(f"Fetching {url} with User-Agent: {headers['User-Agent']}")
            return await response.text()
async def main():
    url = "https://httpbin.org/user-agent"
    content = await fetch_with_user_agent(url)
    print(content)
asyncio.run(main())

Output:

Fetching https://httpbin.org/user-agent with User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36
{
  "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
}
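
Note that fetch_with_user_agent opens a new session for every request. If you keep one long-lived session (the usual aiohttp pattern), you can still rotate the User-Agent per request by passing headers to session.get; here is a minimal sketch:

async def fetch_rotating(session, url):
    # Pick a fresh random User-Agent for each individual request
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    async with session.get(url, headers=headers) as response:
        return await response.text()

async def main():
    async with aiohttp.ClientSession() as session:
        for _ in range(3):
            body = await fetch_rotating(session, "https://httpbin.org/user-agent")
            print(body.strip())

asyncio.run(main())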