# crawler.py

"""A simple web crawler that checks for dead links on a given website.
Recursively checks all links to pages hosted by the same site. Supports
multithreaded execution.

Usage: python3 crawler.py [-p <num_threads>] [-t <timeout>] <site>

The -p flag enables multithreading, with the given number of threads.
The -t flag specifies the request timeout in seconds; default is 5.

The crawler prints out URLs as they are being handled and reports bad links.
This printing is intentionally unsynchronized to demonstrate when URLs are
handled.

The crawler does not spoof its user agent, so links from sites such as Google
and Wikipedia, which reject crawlers, are reported as bad.
"""

import socket
import sys
from html.parser import HTMLParser
from http.client import HTTPException
from queue import Queue
from threading import Thread, Lock
from time import time
from urllib.error import HTTPError, URLError
from urllib.parse import urlparse
from urllib.request import urlopen

from ucb import main

# Request timeout in seconds, used unless the -t flag overrides it.
default_timeout = 5

#################
# URL Functions #
#################

def make_url(url, base):
    """Turn a link as written in a page into a full, simplified http URL.

    url is the link text; base is the directory URL against which relative
    links are resolved. Returns None for links that use a scheme other than
    http and for links that have neither a host nor a path. Only the scheme,
    host and path of the link are kept.

    >>> make_url('../sp13', 'http://inst.eecs.berkeley.edu/~cs61a/fa12/')
    'http://inst.eecs.berkeley.edu/~cs61a/sp13'
    >>> make_url('http://espn.com', 'http://mlb.com/')
    'http://espn.com'
    >>> make_url('ftp://some-site.com', 'http://mlb.com/')
    """
    parts = urlparse(url)
    scheme = parts.scheme or 'http'  # links without a scheme count as http
    if scheme != 'http':
        return None

    host = parts.netloc
    path = parts.path
    if not host:
        if not path:
            return None
        if path[0] != '/':
            # Relative link: resolve it against the base directory
            return simplify_url(base + path)
        # Host-relative link: borrow the host from the base URL
        host = urlparse(base).netloc
    return simplify_url(scheme + '://' + host + path)

def simplify_url(url):
    """Simplify a URL by processing .'s and ..'s, and replacing double slashes
    with single slashes. Trailing slashes are dropped.

    A '..' that would climb above the root of the URL is ignored, so the
    scheme and host are never removed.

    >>> simplify_url('http://inst.eecs.berkeley.edu/~cs61a/./sp13/projects/../..')
    'http://inst.eecs.berkeley.edu/~cs61a'
    >>> simplify_url('http://a.com/../b')
    'http://a.com/b'
    >>> simplify_url('http://a.com/../../..')
    'http://a.com'
    """
    pieces = url.split('/')
    # The extra '/' restores the second slash of '://' when joined below
    result = [pieces[0] + '/']

    # 'scheme://host/...' splits into ['scheme:', '', 'host', ...]; in that
    # case the host piece must survive any number of '..' segments too.
    has_host = len(pieces) > 2 and pieces[0].endswith(':') and not pieces[1]
    floor = 2 if has_host else 1

    for piece in pieces[1:]:
        if piece == '..':
            if len(result) > floor:
                result.pop()
        elif piece and piece != '.':
            result.append(piece)
    return '/'.join(result)

def get_base(url):
    """Extract the base directory of a URL, including the trailing slash.

    A URL that consists only of a scheme and host is its own base directory.
    Raises ValueError if the URL contains no slash at all.

    >>> get_base('http://inst.eecs.berkeley.edu/~cs61a/sp13/index.html')
    'http://inst.eecs.berkeley.edu/~cs61a/sp13/'
    >>> get_base('http://espn.com')
    'http://espn.com/'
    """
    sep = url.find('://')
    if sep >= 0 and url.find('/', sep + 3) < 0:
        # No path: the only slashes belong to '://', so cutting at the last
        # one would leave just 'scheme://' and lose the host.
        return url + '/'
    return url[:url.rindex('/')+1]

######################
# Parser and Crawler #
######################

class LinkParser(HTMLParser):
    """A parser that parses an HTML page for links, adding them to the Crawler
    associated with this parser."""
    def __init__(self, crawler):
        # HTMLParser used to take a positional 'strict' flag. It was removed
        # in Python 3.5 and the remaining argument is keyword-only, so
        # passing False positionally raises TypeError there.
        HTMLParser.__init__(self)
        self.crawler = crawler  # receives every link found via queue_url
        self.base = None        # directory URL for resolving relative links
        self.page = None        # URL of the page being parsed

    def reset_with_page(self, page):
        """Reset this parser for the given page. Must be called before
        feeding the page's data so that links are resolved against it."""
        self.reset()
        self.base = get_base(page)
        self.page = page

    def handle_starttag(self, tag, attrs):
        """Queue <a href=...> links found in the page in this parser's
        Crawler."""
        if tag != 'a':
            return
        for name, value in attrs:
            # value is None for a bare 'href' with no '=...'; such a link
            # (like an empty one) names nothing to crawl.
            if name == 'href' and value:
                self.crawler.queue_url(value, self.base, self.page)


class Crawler(object):
    """A web crawler that processes links in a given website, recursively
    following all links on that site. This crawler supports parallel execution.
    Crawling is done by calling crawl with a page parser that queues new tasks
    in this Crawler.

    Each task is a (url, parent, read) tuple: the normalized URL to request,
    the page on which the link was found (None for the start page), and
    whether the page should be downloaded and parsed for further links."""
    def __init__(self, site, timeout, parallel=False):
        self.site = site
        # Queued URLs are normalized by make_url (which, for example, drops
        # trailing slashes), so the on-site test must compare against the
        # normalized form of the site rather than the raw argument. This is
        # None if site is not a usable http URL.
        self._site_root = make_url(site, site)
        self.timeout = timeout
        self.parallel = parallel
        self.queued = set() # set of URLs that have already been seen
        if parallel:
            # Synchronize access both to the set of seen URLs and the task queue
            self.queued_lock = Lock()
            self.queue = Queue()
        else:
            self.queue = []
        self.url_count = 0
        self.queue_url(site, site, None)

    def put_task(self, task):
        """Queue the given task in this Crawler."""
        if self.parallel:
            self.queue.put(task)
        else:
            self.queue.append(task)

    def get_task(self):
        """Retrieve a task from this Crawler. The caller should first check that
        tasks remain. In parallel mode this blocks until a task is
        available."""
        if self.parallel:
            return self.queue.get()
        else:
            return self.queue.pop()

    def task_done(self):
        """Inform the Crawler that a task has completed. This should be done
        every time a task is finished."""
        if self.parallel:
            self.queue.task_done()

    def all_done(self):
        """Check whether or not all tasks have completed."""
        if self.parallel:
            # No synchronization needed; unfinished_tasks will never hit 0
            # unless everything is done
            return self.queue.unfinished_tasks == 0
        else:
            return len(self.queue) == 0

    def unsynchronized_already_seen(self, url):
        """Check if a URL has already been seen, adding it to the set of seen
        URLs if not already there. A false value (None or empty) counts as
        seen, so rejected links are never queued. Access to the set should be
        synchronized by the caller if necessary."""
        if not url or url in self.queued:
            return True
        self.queued.add(url)
        self.url_count += 1
        return False

    def already_seen(self, url):
        """Check if the given URL has already been seen. Locks access to the set
        of seen URLs if crawling is being done in parallel."""
        if self.parallel:
            with self.queued_lock: # lock access to set
                return self.unsynchronized_already_seen(url)
        else:
            return self.unsynchronized_already_seen(url)

    def _should_read(self, url):
        """Decide whether the page at the given normalized URL should be
        downloaded and parsed: it must be on this site and look like HTML."""
        root = self._site_root
        # Match whole path components, so that 'http://a.com' does not claim
        # 'http://a.com.example.org/...' as one of its own pages.
        if not root or not (url == root or url.startswith(root + '/')):
            return False
        # Look at the last component of the path only; the host name (e.g.
        # 'site.com') must not be mistaken for a file name with an extension.
        page = urlparse(url).path.rsplit('/', 1)[-1]
        if '.' in page:
            return page.rsplit('.', 1)[-1] in ('html', 'htm')
        return True

    def queue_url(self, url, base, parent):
        """Queue the given URL for reading, if it hasn't been seen before.
        url is resolved against base; parent is the page containing the
        link."""
        url = make_url(url, base) # construct and/or simplify the URL
        if self.already_seen(url):
            return

        # Only read the page if it is on this site and is HTML
        read = self._should_read(url)

        # Safely queue a new task to process the URL
        self.put_task((url, parent, read))

    def handle_url(self, url_info, parser):
        """Process the URL specified by url_info with the given parser. Messages
        produced by this method are intentionally unsynchronized."""
        url, parent, read = url_info
        print('handling:', url)

        # Request, but don't read the page. OSError covers HTTPError, URLError
        # and socket timeouts as well as connection resets raised while
        # waiting for the response; HTTPException covers malformed responses
        # and URLs that http.client refuses to send.
        try:
            opened = urlopen(url, timeout=self.timeout)
        except (OSError, HTTPException) as e:
            print('bad link in {0}: {1}'.format(parent, url))
            print('error:', e)
            return

        # Always release the connection, whether or not the page is read
        with opened:
            if not read:
                return

            # Now read the page and send data to the parser
            parser.reset_with_page(opened.geturl())
            try:
                # Honor the charset declared by the server and tolerate stray
                # bytes, so one bad character does not hide a page's links
                charset = opened.headers.get_content_charset() or 'utf-8'
                data = opened.read().decode(charset, errors='replace')
                parser.feed(data)
            except Exception as e:
                print('error while reading {0}: {1}'.format(url, e))

    def crawl(self, parser):
        """Crawl the site with the given parser."""
        while not self.all_done():
            task = self.get_task()
            try:
                self.handle_url(task, parser)
            finally:
                # Mark the task finished even if handling it failed; otherwise
                # anything waiting on the queue in parallel mode never wakes
                self.task_done()
        
#################
# Crawl Masters #
#################

def serial_crawl(site, timeout, num_threads=1):
    """Check the given site for dead links, one URL at a time, then print how
    long the crawl took and how many URLs were examined.

    timeout is the request timeout in seconds. num_threads must be 1; any
    other value fails the assertion below."""
    assert num_threads == 1, 'serial_crawl cannot use multiple threads'

    crawler = Crawler(site, timeout)
    link_parser = LinkParser(crawler)

    began = time()
    crawler.crawl(link_parser)
    elapsed = round(time() - began, 2)

    print('serial crawl took {0} seconds, examined {1} urls'.format(
        elapsed, crawler.url_count))


def parallel_crawl(site, timeout, num_threads=4):
    """Crawl the given site in parallel for dead links. timeout is the request
    timeout in seconds. num_threads is the number of threads to use for
    crawling.

    Raises ValueError if num_threads is less than 1: with no worker threads
    the start URL queued by the Crawler would never be processed and the
    wait on the task queue below would block forever."""
    if num_threads < 1:
        raise ValueError('parallel_crawl needs at least one thread')

    crawler = Crawler(site, timeout, parallel=True)
    # Each thread gets its own parser, since a parser holds per-page state
    threads = [Thread(target=crawler.crawl, args=(LinkParser(crawler),))
               for _ in range(num_threads)]

    start = time()
    for t in threads:
        t.daemon = True # don't wait for spawned threads to exit
        t.start()
    crawler.queue.join() # wait for all tasks to be finished
    total = round(time() - start, 2)

    msg = 'parallel crawl took {0} seconds, examined {1} urls'
    print(msg.format(total, crawler.url_count))

##########################
# Command Line Interface #
##########################

@main
def run(*args):
    """Parse the command-line arguments and start the requested crawl.

    args are the command-line arguments, as strings:
      -p <num>      crawl in parallel with <num> threads
      -t <num>      use <num> as the request timeout in seconds
      http://...    the site to crawl; exactly one must be given
    Any other argument (including -h and -help), a flag without a positive
    integer value, or a missing URL prints a message to standard error and
    returns without crawling.
    """
    def usage():
        print('Options:\n' +
              '  -p <num>     run with <num> threads\n' +
              '  -t <num>     use <num> as the request timeout',
              file=sys.stderr)

    crawl, num_threads = serial_crawl, 1
    url, timeout = None, default_timeout
    i = 0
    while i < len(args):
        arg = args[i]
        if arg == '-p' or arg == '-t':
            # Both flags consume the next argument, which must be a positive
            # integer; report a missing or malformed value instead of failing
            # with IndexError or ValueError.
            try:
                value = int(args[i + 1])
            except (IndexError, ValueError):
                value = None
            if value is None or value < 1:
                print('{0} requires a positive integer'.format(arg),
                      file=sys.stderr)
                usage()
                return
            if arg == '-p':
                crawl, num_threads = parallel_crawl, value
            else:
                timeout = value
            i += 2
        elif arg.startswith('http://'):
            if url:
                print('only one URL may be provided', file=sys.stderr)
                return
            url = arg
            i += 1
        else:
            if arg != '-h' and arg != '-help':
                print('unknown argument:', arg, file=sys.stderr)
            usage()
            return

    if url is None:
        print('no URL provided', file=sys.stderr)
        usage()
        return
    crawl(url, timeout, num_threads)