commit 7fef91a7208c5e7a2665f6355dc6439428e93b41
parent 8576e54b93f4c717c6b9c1e94fccc28d6230780e
Author: Stefan <stefan@eliteinformatiker.de>
Date: Thu, 24 Jan 2013 14:42:03 +0100
added cli interface
Diffstat:
2 files changed, 41 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
@@ -3,6 +3,27 @@ Deadlink crawler
This is a small crawler searching your website for deadlinks.
+Via command line
+----------------
+
+There is a CLI interface for using the crawler. You **must** pass a URL as the starting point for crawling; this might be the home page of your website.
+
+Additional options available are:
+
+- `--restrict`: Pass a regular expression to restrict the crawl to a subset of the URLs it finds. Usually you will want something like `http://example.com/.*` to stay on your own domain.
+- `--wait`: Set a waiting time in seconds between each URL fetch.
+
+```bash
+# Crawl all subsites of http://stefan-koch.name/ for deadlinks (including external deadlinks)
+python2.7 crawler.py --wait 1 --restrict http://stefan-koch.name/.* http://stefan-koch.name/
+# Crawl all article pages of example.com for deadlinks, assuming articles are linked from the main page.
+python2.7 crawler.py --restrict http://example.com/article/.+ http://example.com/
+```
+
+
+Using an instance of the class
+------------------------------
+
You can use it by creating a new instance of the class and running the crawler. The crawler class supports different options.
```python
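# A sketch of class-based usage (inferred from the module-level code removed in
# crawler.py below; the README's actual example is not shown in this diff):
from crawler import Crawler

c = Crawler("http://stefan-koch.name/")
c.set_url_restrict("http://stefan-koch.name/.*")
c.set_wait_time(1)
c.crawl()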
diff --git a/crawler.py b/crawler.py
@@ -4,6 +4,8 @@ import re
import time
import httplib
+import argparse
+
# TODO: Do not apply wait time to external links
class Crawler(object):
@@ -49,7 +51,7 @@ class Crawler(object):
def visit_url(self, url, found_via):
response = self.check_url(url, found_via)
- if response != None and self.url_match.search(url):
+ if response != None and not self.excluded(url):
self.collect_new_urls(url, response.read())
def collect_new_urls(self, url, html):
@@ -93,8 +95,21 @@ class Crawler(object):
soup = BeautifulSoup(page)
return [link.get('href') for link in soup.findAll('a')]
+ def excluded(self, url):
+ return self.url_match != None and not self.url_match.search(url)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Search a website for deadlinks")
+ parser.add_argument('url', metavar='URL', type=str, help="The starting point for your crawl")
+ parser.add_argument('--restrict', dest='restrict', help="Restrict the crawl to specific URLs via a regular expression (usually your own domain)")
+ parser.add_argument('--wait', dest='wait_time', type=float, help="Set a waiting time in seconds between each URL fetch")
+
+ args = parser.parse_args()
-c = Crawler("http://stefan-koch.name/")
-c.set_url_restrict("http://stefan-koch.name/.*")
-c.set_wait_time(1)
-c.crawl()
+ c = Crawler(args.url)
+ if args.restrict:
+ c.set_url_restrict(args.restrict)
+ if args.wait_time:
+ c.set_wait_time(args.wait_time)
+ c.crawl()
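For reference, a minimal standalone sketch of the new `excluded()` semantics (not part of the commit; it assumes `set_url_restrict()` compiles the `--restrict` pattern with `re.compile`, which the `search()` call above suggests). With no restriction set, nothing is excluded, so the URL argument alone is now enough to start a crawl.

```python
import re

class Demo(object):
    """Hypothetical stand-in mirroring only the excluded() logic from the diff."""
    def __init__(self, restrict=None):
        # Assumption: url_match is either None (no --restrict given)
        # or a regex compiled from the --restrict pattern.
        self.url_match = re.compile(restrict) if restrict else None

    def excluded(self, url):
        # A URL is excluded only if a restriction exists and does not match it.
        return self.url_match is not None and not self.url_match.search(url)

print(Demo().excluded("http://other.org/"))        # False: no restriction, crawl everything
d = Demo("http://example.com/article/.+")
print(d.excluded("http://example.com/article/1"))  # False: matches the pattern
print(d.excluded("http://other.org/"))             # True: filtered out
```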