deadlink-crawler

[unmaintained] crawls a site to detect dead links

commit 7fef91a7208c5e7a2665f6355dc6439428e93b41
parent 8576e54b93f4c717c6b9c1e94fccc28d6230780e
Author: Stefan <stefan@eliteinformatiker.de>
Date:   Thu, 24 Jan 2013 14:42:03 +0100

added cli interface

Diffstat:
M README.md  | 21 +++++++++++++++++++++
M crawler.py | 25 ++++++++++++++++++++-----
2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
@@ -3,6 +3,27 @@ Deadlink crawler
 
 This is a small crawler searching your website for deadlinks.
 
+Via command line
+----------------
+
+There is a CLI interface to use the crawler. You **must** pass an URL as the starting point for crawling. This might be the home page of your website.
+
+Additional options available are:
+
+- `--restrict`: Pass a regular expression for restricting your URL to a subset of all websites it finds. Usually you will want to use something like `http://example.com/.*` for this.
+- `--wait`: Set some time for waiting in seconds between each URL opening.
+
+```bash
+# Crawl all subsites of http://stefan-koch.name/ for deadlinks (including external deadlinks)
+python2.7 crawler.py --wait 1 --restrict http://stefan-koch.name/.* http://stefan-koch.name/
+# Crawl all article pages of example.com for deadlinks. We assume that there are linked articles on the main page
+python2.7 crawler.py --restrict http://example.com/article/.+ http://example.com/
+```
+
+
+Using an instance of the class
+------------------------------
+
 You can use it by creating a new instance of the class and running the crawler. The crawler class supports different options.
 
 ```python
diff --git a/crawler.py b/crawler.py
@@ -4,6 +4,8 @@
 import re
 import time
 import httplib
+import argparse
+
 
 # TODO: Do not apply wait time to external links
 class Crawler(object):
@@ -49,7 +51,7 @@ class Crawler(object):
     def visit_url(self, url, found_via):
         response = self.check_url(url, found_via)
 
-        if response != None and self.url_match.search(url):
+        if response != None and not self.excluded(url):
             self.collect_new_urls(url, response.read())
 
     def collect_new_urls(self, url, html):
@@ -93,8 +95,21 @@ class Crawler(object):
         soup = BeautifulSoup(page)
         return [link.get('href') for link in soup.findAll('a')]
 
+    def excluded(self, url):
+        return self.url_match != None and not self.url_match.search(url)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Search a website for deadlinks")
+    parser.add_argument('url', metavar='URL', type=str, help="The starting point for your crawl")
+    parser.add_argument('--restrict', dest='restrict', help="Restrict the crawl to specific URLs via a regular expression (usually your own domain")
+    parser.add_argument('--wait', dest='wait_time', type=float, help="Set some waiting time between each URL fetch")
+
+    args = parser.parse_args()
 
-c = Crawler("http://stefan-koch.name/")
-c.set_url_restrict("http://stefan-koch.name/.*")
-c.set_wait_time(1)
-c.crawl()
+    c = Crawler(args.url)
+    if args.restrict:
+        c.set_url_restrict(args.restrict)
+    if args.wait_time:
+        c.set_wait_time(args.wait_time)
+    c.crawl()
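
The behavioural change worth noting in crawler.py is the new `excluded()` helper: `visit_url()` used to call `self.url_match.search(url)` directly, which assumes a restriction pattern is always set, while the CLI makes `--restrict` optional. With the helper, a URL is skipped only when a pattern exists and does not match it. A minimal sketch of that check, using a hypothetical free function for illustration rather than the class method from the diff:

```python
import re

def excluded(url_match, url):
    # Skip a URL only when a restriction pattern is set and it does not match;
    # with no pattern (i.e. no --restrict given), nothing is excluded.
    return url_match is not None and not url_match.search(url)

pattern = re.compile("http://example.com/article/.+")
print(excluded(pattern, "http://example.com/article/some-post"))  # False -> crawled
print(excluded(pattern, "http://elsewhere.example.org/"))         # True  -> skipped
print(excluded(None, "http://anywhere.example.net/"))             # False -> crawled
```

This mirrors the `self.url_match != None and not self.url_match.search(url)` expression in the commit: when `--restrict` is omitted, `set_url_restrict()` is never called, no pattern is set, and every discovered link is followed.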