commit 8a130178da817297c47a7e5bedaad4fd5bfa38c9
parent f8d030404c1217829e34feeee2cd62fdcf99685e
Author: Stefan Koch <taikano@lavabit.com>
Date: Thu, 30 May 2013 10:27:34 -0700
Merge pull request #1 from janhoy/improvements
Several improvements
Diffstat:
A | CHANGES.md  |  31 | +++++++++++++++++++++++++++++++
A | LICENSE.txt | 177 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M | README.md   |  32 | +++++++++++++++++++++++++++-----
M | crawler.py  | 190 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
M | frontier.py |  26 | +++++++++++++++++++++++---
5 files changed, 432 insertions(+), 24 deletions(-)
diff --git a/CHANGES.md b/CHANGES.md
@@ -0,0 +1,30 @@
+# Version history for deadlink-crawler
+
+## Version 1.1, May 2013
+This version was contributed by https://github.com/janhoy
+New features in this release:
+
+ * Explicitly chose the Apache Software License for the product, added file LICENSE.txt
+ * Added a CHANGES.md file to track changes.
+ * Prints intro section with current time, start URL and restrictions
+ * Prints summary section with stats, timing and number of dead links found
+ * Exits with error level 2 if dead links are found - handy for acting on the result in a shell script
+ * Less verbose. By default it only prints pages crawled and deadlinks found, and a summary at the end
+ * option `silent`: Be completely silent. Only print the summary at the end. Nice for piping to a script.
+ * option `debug`: Be super-verbose, printing all links found on each page
+ * option `report40x`: Also report HTTP 40x codes (401/403/407/415) as dead, not only 404
+ * option `exclude`: Exclude URLs matching the given regex from the crawl and deadlink-checking
+
+Bug fixes:
+
+ * Password protected pages were reported as dead. Now they are not (re-enable with `--report40x`)
+ * Redirects were given the wrong URL and reported as dead. Redirects are now handled correctly
+ * UnicodeEncodeError is now caught when fetching, and the offending link is ignored
+ * The same URL with different fragments, e.g. http://example.com/foo#fragment, is now only checked once
+
+## Version 1.0, February 2013
+This was the first release, with the initial feature set:
+
+ * `restrict` option for limiting the crawl
+ * `wait` option for slower crawling
+ * `politeness` option for playing nice with the same host
\ No newline at end of file
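
As an aside to the fragment bug fix listed above: the idea (implemented in `frontier.Frontier.add` later in this patch) is to strip the `#fragment` with `urlparse.urldefrag` before deduplication, so `http://example.com/foo#a` and `http://example.com/foo#b` count as one URL. Below is a minimal, self-contained sketch of that idea; the `seen` set and the sample links are illustrative, not from the codebase.

```python
import urlparse

seen = set()
for link in ["http://example.com/foo#intro",
             "http://example.com/foo#details",
             "http://example.com/bar"]:
    url, frag = urlparse.urldefrag(link)  # drop the #fragment part
    if url in seen:
        print "already queued: %s" % url
        continue
    seen.add(url)
    print "queueing: %s" % url
```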
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -0,0 +1,177 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
diff --git a/README.md b/README.md
@@ -1,12 +1,13 @@
Deadlink crawler
================
-This is a small crawler searching your website for deadlinks.
+This is a small crawler that searches a website for dead links.
Dependencies
------------
-This program requires **BeautifulSoup**.
+This program requires **BeautifulSoup**, which can be installed with e.g.:
+`sudo easy_install beautifulsoup`
Via command line
----------------
@@ -15,17 +16,30 @@ There is a CLI interface to use the crawler. You **must** pass an URL as the sta
Additional options available are:
-- `--restrict`: Pass a regular expression for restricting your URL to a subset of all websites it finds. Usually you will want to use something like `http://example.com/.*` for this.
-- `--wait`: Set some time for waiting in seconds between each URL opening.
+- `--restrict`: Restrict the crawl to pages whose URLs match the given regular expression
+ - If not specified, defaults to all pages within the domain of the start URL
+- `--wait`: Time in seconds to wait between opening each URL. Default=0
+- `--politeness`: Time in seconds to wait between two requests to the same domain. Default=1
+- `--exclude`: Exclude URLs matching the given regex from the crawl and from deadlink-checking
+- `--silent`: Turn off verbose output. Only print summary at the end.
+- `--debug`: Be super-verbose, printing all links found on each page
+- `--report40x`: Also report HTTP 40x codes (401/403/407/415) as dead, not only 404
+Examples:
```bash
# Crawl all subsites of http://stefan-koch.name/ for deadlinks (including external deadlinks)
# Wait one second between opening each URL
-python2.7 crawler.py --wait 1 --restrict http://stefan-koch.name/.* http://stefan-koch.name/
+python2.7 crawler.py --wait 1 http://stefan-koch.name/
# Crawl all article pages of example.com for deadlinks.
# We assume that there are linked articles on the main page
python2.7 crawler.py --restrict http://example.com/article/.+ http://example.com/
+
+# Crawl all subdomains of example.com, with silent mode and reporting HTTP 40x as dead
+python2.7 crawler.py --silent --report40x --restrict "http://.*\.example\.com/.*" http://www.example.com/
+
+# Crawl example.com, excluding print pages and calendars
+python2.7 crawler.py --exclude "print|calendar" http://www.example.com/
```
@@ -50,3 +64,11 @@ c.set_wait_time(1)
# start the crawling process
c.crawl()
```
+
+License
+-------
+The crawler is licensed under the Apache Software License v2.0; see [LICENSE.txt](LICENSE.txt) for details.
+
+Version history
+---------------
+See [CHANGES.md](CHANGES.md) for the complete version history.
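
To complement the CLI examples in the README diff above, here is a hedged sketch of driving the crawler as a library with the new 1.1 options. The property names (`restrict`, `exclude`, `report40x`, `wait_time`, `polite_time`) come from crawler.py in this commit; the URLs and regexes are placeholders.

```python
import crawler

c = crawler.Crawler("http://example.com/")
c.restrict = "http://example.com/.*"   # only follow internal pages
c.exclude = "print|calendar"           # skip print views and calendars
c.report40x = True                     # also treat 401/403/407/415 as dead
c.wait_time = 1                        # seconds to sleep between fetches
c.polite_time = 1                      # seconds between requests to the same domain
c.crawl()  # prints a summary, then exits with status 2 if dead links were found
```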
diff --git a/crawler.py b/crawler.py
@@ -1,15 +1,35 @@
+"""
+Deadlink crawler - https://github.com/taikano/deadlink-crawler
+
+Copyright 2013- taikano and other contributors at GitHub
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
from BeautifulSoup import BeautifulSoup
+from time import gmtime, strftime, time
import urllib2, httplib, urlparse
import re
import time
-
+import sys
import argparse
-
+import socket
import frontier
class Crawler(object):
def __init__(self, init_url):
- init_domain = urlparse.urlparse(init_url).netloc
+ self.init_url = init_url
+ self.init_domain = urlparse.urlparse(init_url).netloc
# Manages our domains we want to visit or have visited
self.frontier = frontier.Frontier()
@@ -23,8 +43,27 @@ class Crawler(object):
# URLs)
self._url_match = None
+ # Regular expression for URLs we want to exclude from the crawl
+ # and from deadlink-checking
+ self._exclude = None
+
# Timeout in seconds to wait, so that we do not kill our server
self._wait_time = 0
+
+ # Verbose
+ self._verbose = True
+
+ # Debug
+ self._debug = False
+
+ # Report 40x http codes as deadlinks
+ self._report40x = False
+
+ # For progress reporting
+ self._pages = 0
+ self._links = 0
+ self._via = 0
+ self._dead = 0
@property
def restrict(self):
@@ -35,6 +74,39 @@ class Crawler(object):
self._url_match = re.compile(url_match)
@property
+ def exclude(self):
+ return self._exclude
+
+ @exclude.setter
+ def exclude(self, exclude):
+ self._exclude = re.compile(exclude)
+
+ @property
+ def verbose(self):
+ return self._verbose
+
+ @verbose.setter
+ def verbose(self, verbose):
+ self._verbose = verbose
+
+ @property
+ def debug(self):
+ return self._debug
+
+ @debug.setter
+ def debug(self, debug):
+ self._verbose = debug # debug implies verbose output
+ self._debug = debug
+
+ @property
+ def report40x(self):
+ return self._report40x
+
+ @report40x.setter
+ def report40x(self, report40x):
+ self._report40x = report40x
+
+ @property
def wait_time(self):
return self._wait_time
@@ -53,6 +125,13 @@ class Crawler(object):
self.frontier.polite_time = seconds
def crawl(self):
+ _starttime = time.time()
+ if self.restrict == None:
+ self.restrict = "http://%s.*" % self.init_domain
+
+ print "Deadlink-crawler version 1.1"
+ print "Starting crawl from URL %s at %s with restriction %s\n" % (self.init_url, strftime("%Y-%m-%d %H:%M:%S", gmtime()), "http://%s.*" % self.init_domain)
+
while len(self.frontier) > 0:
time.sleep(self.wait_time)
@@ -67,6 +146,17 @@ class Crawler(object):
continue
self.print_deadlinks(self.deadlinks)
+
+ _elapsed = time.time() - _starttime
+
+ print "\nSummary:\n--------"
+ print "Crawled %d pages and checked %d links in %s time." % (self._pages, self._links, strftime("%H:%M:%S", gmtime(_elapsed)))
+ print "Found a total of %d deadlinks in %d different pages" % (self._dead, self._via)
+
+ if len(self.deadlinks) == 0:
+ exit(0)
+ else:
+ exit(2)
def visit_url(self, url, found_via):
response = self.check_url(url, found_via)
@@ -77,29 +167,78 @@ class Crawler(object):
self.collect_new_urls(url, response.read())
def collect_new_urls(self, url, html):
- print("Fetching new URLs from: %s" % url)
+ if self._verbose:
+ print("Processing %s" % url)
+ # Keep track of how many of our site's pages we have crawled, and print status now and then
+ self._pages += 1
+ if self._pages % 100 == 0:
+ print >> sys.stderr, "Processed %s links from %s pages" % (self._links, self._pages)
+
try:
for page in self.extract_urls(html):
+ if page != None:
+ page = page.strip() # Handle some malformed links
page = urlparse.urljoin(url, page)
- print("adding page %s" % page)
- self.frontier.add(page, url)
+ if self._exclude != None and self._exclude.search(page):
+ if self._debug:
+ print "Not adding link %s to crawl backlog (excluded by --exclude rule)" % page
+ else:
+ if self.frontier.add(page, url):
+ if self._debug:
+ print("Adding link %s to crawl backlog" % page)
except UnicodeEncodeError:
pass
def check_url(self, url, found_via):
- print("Trying URL: %s" % url)
+ if self._exclude != None and self._exclude.search(url):
+ if self._debug:
+ print "Not checking URL %s (excluded by --exclude rule)" % url
+ return None
+ if self._debug:
+ print("Checking URL: %s" % url)
+
+ self._links += 1
request = urllib2.Request(url)
try:
- response = urllib2.urlopen(request, None, 10)
- except (urllib2.HTTPError, httplib.BadStatusLine):
+ response = urllib2.urlopen(request, timeout=10)
+ except urllib2.HTTPError as e:
# We receive an exception in case of 404
+ if (e.code == 403 or e.code == 401 or e.code == 407 or e.code == 415) and not self._report40x:
+ if self._debug:
+ print "Got HTTP %s - not adding to deadlinks list (control with --report40x=True)" % (e.code)
+ else:
+ if self._debug:
+ print "Got HTTP %s - Adding to deadlinks list" % (e.code)
+ self.add_to_deadlinks(url, found_via)
+ return None
+ except httplib.BadStatusLine:
+ if self._verbose:
+ print "Got Exception BadStatusLine for url %s - Adding to deadlinks list" % url
self.add_to_deadlinks(url, found_via)
+ return None
+ except UnicodeEncodeError:
+ if self._verbose:
+ print "Got UnicodeEncodeError for url %s, skipping" % url
+ return None
+ except urllib2.URLError as e:
+ if self._verbose:
+ print "Got URLError for page %s" % url
+ return None
+ except socket.timeout:
+ if self._verbose:
+ print "Got timeout reading page %s, skipping" % url
return None
status = response.getcode()
+ redirurl = response.geturl()
+ if url != redirurl:
+ if self._debug:
+ print "Followed redirect from %s to %s" % (url, redirurl)
+ url = redirurl
if status != None and status >= 400:
self.add_to_deadlinks(url, found_via)
@@ -109,22 +248,29 @@ class Crawler(object):
self.deadlinks.setdefault(found_via, [])
self.deadlinks[found_via].append(url)
- print("Found new deadlink %s on %s" % (url, found_via))
+ self._dead += 1
+
+ if self._verbose:
+ print " Found deadlink: %s" % url
def extract_urls(self, page):
soup = BeautifulSoup(page)
return [link.get('href') for link in soup.findAll('a')]
def excluded(self, url):
- return self._url_match != None and not self._url_match.search(url)
+ outside = self._url_match != None and not self._url_match.search(url)
+ excluded = self._exclude != None and self._exclude.search(url)
+ if excluded and self._debug:
+ print "Not following URL %s which is excluded by --exclude rule" % url
+ return outside or excluded
def print_deadlinks(self, deadlinks):
if len(deadlinks) == 0:
- print("No deadlinks were found. Hooray!")
+ print("\nNo deadlinks were found. Hooray!")
else:
- print("The following deadlinks were found")
- print()
+ print("\nThe following deadlinks were found\n")
for via in deadlinks:
+ self._via += 1
print("%s" % via)
for target in deadlinks[via]:
print("\t%s" % target)
@@ -134,8 +280,12 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Search a website for deadlinks")
parser.add_argument('url', metavar='URL', type=str, help="The starting point for your crawl")
parser.add_argument('--restrict', dest='restrict', help="Restrict the crawl to specific URLs via a regular expression (usually your own domain)")
- parser.add_argument('--wait', dest='wait_time', type=float, help="Set some waiting time between each URL fetch")
- parser.add_argument('--politeness', dest='polite_time', type=float, help="Set the time to wait between calling two URLs of the same domain")
+ parser.add_argument('--wait', dest='wait_time', type=float, help="Set some waiting time between each URL fetch (default=0)")
+ parser.add_argument('--politeness', dest='polite_time', type=float, help="Set the time to wait between calling two URLs of the same domain (default=1)")
+ parser.add_argument('--exclude', dest='exclude', help="Exclude URLs matching the given regex from crawl and deadlink-checking")
+ parser.add_argument('--silent', dest='silent', action='store_true', default=False, help="Turn off verbose output")
+ parser.add_argument('--debug', dest='debug', action='store_true', default=False, help="Be super-verbose")
+ parser.add_argument('--report40x', dest='report40x', action='store_true', default=False, help="Also report HTTP 40x codes (401/403/407/415) as dead, not only 404")
args = parser.parse_args()
@@ -146,4 +296,12 @@ if __name__ == "__main__":
c.wait_time = args.wait_time
if args.polite_time:
c.polite_time = args.polite_time
+ if args.silent:
+ c.verbose = not args.silent
+ if args.debug:
+ c.debug = args.debug
+ if args.report40x:
+ c.report40x = args.report40x
+ if args.exclude:
+ c.exclude = args.exclude
c.crawl()
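
As a standalone illustration of the status handling that `Crawler.check_url` implements above (404 and other 4xx/5xx responses are dead; 401/403/407/415 only count when `report40x` is enabled; unreachable hosts and timeouts are logged but not reported), here is a hedged, simplified sketch. The helper name `is_dead` is invented for this example and does not exist in the codebase.

```python
import socket
import urllib2

def is_dead(url, report40x=False):
    """Rough, simplified restatement of the checks in Crawler.check_url."""
    try:
        response = urllib2.urlopen(urllib2.Request(url), timeout=10)
    except urllib2.HTTPError as e:
        # Password-protected pages and friends (401/403/407/415) are only
        # considered dead when report40x is enabled; 404 etc. always are.
        if e.code in (401, 403, 407, 415) and not report40x:
            return False
        return True
    except (urllib2.URLError, socket.timeout):
        return False  # unreachable or timed out: logged, but not counted as dead
    # response.geturl() differs from url when a redirect was followed
    status = response.getcode()
    return status is not None and status >= 400

print is_dead("http://example.com/no-such-page")
```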
diff --git a/frontier.py b/frontier.py
@@ -1,3 +1,21 @@
+"""
+Deadlink crawler - https://github.com/taikano/deadlink-crawler
+
+Copyright 2013- taikano and other contributors at GitHub
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
import urlparse
import time
import heapq
@@ -22,11 +40,13 @@ class Frontier(object):
def polite_time(self, seconds):
if seconds >= 0:
self._polite_time = seconds
-
- def add(self, url, found_via):
+
+ def add(self, url, found_via, defrag=True):
+ if defrag:
+ url, frag = urlparse.urldefrag(url)
if url in self.found:
return False
-
+
domain = urlparse.urlparse(url).netloc
# means this is the first URL in our set