commit 7fef91a7208c5e7a2665f6355dc6439428e93b41
parent 8576e54b93f4c717c6b9c1e94fccc28d6230780e
Author: Stefan <stefan@eliteinformatiker.de>
Date: Thu, 24 Jan 2013 14:42:03 +0100
added cli interface
Diffstat:
2 files changed, 41 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
@@ -3,6 +3,27 @@ Deadlink crawler
This is a small crawler searching your website for deadlinks.
+Via command line
+----------------
+
+There is a CLI interface for using the crawler. You **must** pass a URL as the starting point for crawling; this might be the home page of your website.
+
+Additional options available are:
+
+- `--restrict`: Pass a regular expression to restrict the crawl to a subset of the URLs it finds. Usually you will want something like `http://example.com/.*` to stay on your own domain.
+- `--wait`: Set a waiting time in seconds between each URL fetch.
+
+```bash
+# Crawl all subsites of http://stefan-koch.name/ for deadlinks (including external deadlinks)
+python2.7 crawler.py --wait 1 --restrict http://stefan-koch.name/.* http://stefan-koch.name/
+# Crawl all article pages of example.com for deadlinks, assuming articles are linked from the main page.
+python2.7 crawler.py --restrict http://example.com/article/.+ http://example.com/
+```
+
+
+Using an instance of the class
+------------------------------
+
You can use it by creating a new instance of the class and running the crawler. The crawler class supports different options.
```python
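# A sketch of class-based usage (inferred from the module-level code removed in
# crawler.py below; the README's actual example is not shown in this diff):
from crawler import Crawler

c = Crawler("http://stefan-koch.name/")
c.set_url_restrict("http://stefan-koch.name/.*")
c.set_wait_time(1)
c.crawl()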
diff --git a/crawler.py b/crawler.py
@@ -4,6 +4,8 @@ import re
import time
import httplib
+import argparse
+
# TODO: Do not apply wait time to external links
class Crawler(object):
@@ -49,7 +51,7 @@ class Crawler(object):
def visit_url(self, url, found_via):
response = self.check_url(url, found_via)
- if response != None and self.url_match.search(url):
+ if response != None and not self.excluded(url):
self.collect_new_urls(url, response.read())
def collect_new_urls(self, url, html):
@@ -93,8 +95,21 @@ class Crawler(object):
soup = BeautifulSoup(page)
return [link.get('href') for link in soup.findAll('a')]
+ def excluded(self, url):
+ return self.url_match != None and not self.url_match.search(url)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Search a website for deadlinks")
+ parser.add_argument('url', metavar='URL', type=str, help="The starting point for your crawl")
+ parser.add_argument('--restrict', dest='restrict', help="Restrict the crawl to specific URLs via a regular expression (usually your own domain)")
+ parser.add_argument('--wait', dest='wait_time', type=float, help="Set a waiting time in seconds between each URL fetch")
+
+ args = parser.parse_args()
-c = Crawler("http://stefan-koch.name/")
-c.set_url_restrict("http://stefan-koch.name/.*")
-c.set_wait_time(1)
-c.crawl()
+ c = Crawler(args.url)
+ if args.restrict:
+ c.set_url_restrict(args.restrict)
+ if args.wait_time:
+ c.set_wait_time(args.wait_time)
+ c.crawl()
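For reference, a minimal standalone sketch of the new `excluded()` semantics (not part of the commit; it assumes `set_url_restrict()` compiles the `--restrict` pattern with `re.compile`, which the `search()` call above suggests). With no restriction set, nothing is excluded, so the URL argument alone is now enough to start a crawl.

```python
import re

class Demo(object):
    """Hypothetical stand-in mirroring only the excluded() logic from the diff."""
    def __init__(self, restrict=None):
        # Assumption: url_match is either None (no --restrict given)
        # or a regex compiled from the --restrict pattern.
        self.url_match = re.compile(restrict) if restrict else None

    def excluded(self, url):
        # A URL is excluded only if a restriction exists and does not match it.
        return self.url_match is not None and not self.url_match.search(url)

print(Demo().excluded("http://other.org/"))        # False: no restriction, crawl everything
d = Demo("http://example.com/article/.+")
print(d.excluded("http://example.com/article/1"))  # False: matches the pattern
print(d.excluded("http://other.org/"))             # True: filtered out
```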