commit 29df2b54b4c9715e1be413a2ea011edfc6696dcf
parent 74551fb8b5e0e1854774726ded07b20690a85306
Author: Stefan Koch <misc@stefan-koch.name>
Date: Fri, 18 May 2018 19:59:52 +0200
update changes, readme and version number
Diffstat:
3 files changed, 27 insertions(+), 11 deletions(-)
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,16 @@
# Version history for deadlink-crawler
+## Version 1.2, May 2018
+This release switches from Python 2 to Python 3.
+
+ * changed code for Python 3 compatibility
+ * switched from 8 spaces to 4 spaces per indent
+ * follow both http and https URLs in the default restriction
+
+Bug fixes:
+
+ * display the correct restriction if a user-defined restriction is used
+
## Version 1.1, May 2013
This version was contributed by https://github.com/janhoy
New features in this release:
@@ -27,4 +38,4 @@ This was the first release with initial features
* `restrict` option for limiting crawl
* `wait` option for slower crawling
- * `politeness` option for playing nice with same host-
\ No newline at end of file
+ * `politeness` option for playing nice with same host
diff --git a/README.md b/README.md
@@ -6,8 +6,14 @@ This is a small crawler searching a website for deadlinks.
Dependencies
------------
-This program requires **BeautifulSoup** which can be installed using e.g.:
-`sudo easy_install beautifulsoup`
+All dependencies are listed in the `requirements.txt` file. You can set up a
+virtual environment and install them with:
+
+```bash
+virtualenv env
+source env/bin/activate
+pip install -r requirements.txt
+```
Via command line
----------------
@@ -29,17 +35,17 @@ Examples:
```bash
# Crawl all subsites of http://stefan-koch.name/ for deadlinks (including external deadlinks)
# Wait one second between opening each URL
-python2.7 crawler.py --wait 1 http://stefan-koch.name/
+python crawler.py --wait 1 http://stefan-koch.name/
# Crawl all article pages of example.com for deadlinks.
# We assume that there are linked articles on the main page
-python2.7 crawler.py --restrict http://example.com/article/.+ http://example.com/
+python crawler.py --restrict http://example.com/article/.+ http://example.com/
# Crawl all subdomains of example.com, with silent mode and reporting HTTP 40x as dead
-python2.7 crawler.py --silent --report40x --restrict http://.*\.example\.com/.* http://www.example.com/
+python crawler.py --silent --report40x --restrict http://.*\.example\.com/.* http://www.example.com/
# Crawl example.com, excluding print pages and calendars
-python2.7 crawler.py --exclude print|calendar http://www.example.com/
+python crawler.py --exclude "print|calendar" http://www.example.com/
```
diff --git a/crawler.py b/crawler.py
@@ -129,13 +129,13 @@ class Crawler(object):
    def crawl(self):
        _starttime = time.time()
        if self.restrict is None:
-            self.restrict = "http://%s.*" % self.init_domain
+            self.restrict = "https?://%s.*" % self.init_domain
-        print("Deadlink-crawler version 1.1")
+        print("Deadlink-crawler version 1.2")
        print("Starting crawl from URL %s at %s with restriction %s\n"
              % (self.init_url,
                 time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
-                 "http://%s.*" % self.init_domain))
+                 self.restrict.pattern))
        while len(self.frontier) > 0:
            time.sleep(self.wait_time)
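The crawler.py hunk widens the default restriction from `http://` to `https?://`, so crawls started without `--restrict` now follow both http and https links on the start domain; the `self.restrict.pattern` access suggests the assigned string is compiled into a regular expression elsewhere in the class. A minimal sketch of that matching behaviour, assuming `re.compile` and using the README's example domain (variable names here are illustrative, not taken from the commit):

```python
import re

# Default restriction as built in crawl(): match both schemes for the
# start domain (illustrative sketch, not code from the commit).
init_domain = "stefan-koch.name"
restrict = re.compile("https?://%s.*" % init_domain)

print(bool(restrict.match("http://stefan-koch.name/archive/")))   # True
print(bool(restrict.match("https://stefan-koch.name/archive/")))  # True
print(bool(restrict.match("https://example.com/article/1")))      # False
print(restrict.pattern)  # what version 1.2 prints as the restriction
```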