deadlink-crawler

[unmaintained] crawls a site to detect dead links

commit 74551fb8b5e0e1854774726ded07b20690a85306
parent 8a130178da817297c47a7e5bedaad4fd5bfa38c9
Author: Stefan Koch <misc@stefan-koch.name>
Date:   Fri, 18 May 2018 19:52:00 +0200

make python3 ready

Diffstat:
M .gitignore       |   2 ++
M crawler.py       | 591 ++++++++++++++++++++++++++++++++++++++++++-------------------------------------
M frontier.py      | 153 ++++++++++++++++++++++++++++++++++++++++---------------------------------------
A requirements.txt |   1 +
4 files changed, 391 insertions(+), 356 deletions(-)
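
The port replaces the Python 2 modules urllib2, httplib and urlparse with urllib.request, urllib.error, urllib.parse and http.client, switches from the old BeautifulSoup package to bs4, and converts print statements to print() calls. Below is a minimal sketch of the fetch-and-parse pattern crawler.py follows after this change; fetch_links() is an illustrative helper, not part of the repository, which splits this work across check_url() and collect_new_urls().

# Minimal sketch of the Python 3 fetch-and-parse pattern used in crawler.py
# after this commit. fetch_links() is a hypothetical helper for illustration.
import http.client
import socket
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup  # replaces "from BeautifulSoup import BeautifulSoup"


def fetch_links(url, timeout=10):
    """Return absolute hrefs found on `url`, or None if the fetch fails."""
    try:
        response = urllib.request.urlopen(urllib.request.Request(url),
                                          timeout=timeout)
    except (urllib.error.URLError, http.client.BadStatusLine, socket.timeout):
        # urllib.error.HTTPError is a subclass of URLError, so 4xx/5xx responses
        # end up here as well.
        return None
    soup = BeautifulSoup(response.read(), "html.parser")
    return [urllib.parse.urljoin(url, a.get('href'))
            for a in soup.find_all('a') if a.get('href')]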

diff --git a/.gitignore b/.gitignore @@ -1,3 +1,5 @@ +env + *.py[cod] # C extensions diff --git a/crawler.py b/crawler.py @@ -16,292 +16,323 @@ See the License for the specific language governing permissions and limitations under the License. """ -from BeautifulSoup import BeautifulSoup -from time import gmtime, strftime, time -import urllib2, httplib, urlparse -import re +from bs4 import BeautifulSoup import time -import sys +import urllib.request +import urllib.error +import urllib.parse +import http.client +import re import argparse import socket import frontier + class Crawler(object): - def __init__(self, init_url): - self.init_url = init_url - self.init_domain = urlparse.urlparse(init_url).netloc - - # Manages our domains we want to visit or have visited - self.frontier = frontier.Frontier() - self.frontier.add(init_url, None) - - # List of deadlinks for each URL we have, - # i.e. url1: [deadlink1, deadlink2] - self.deadlinks = {} - - # Regular expression for URLs we are interested in (our internal - # URLs) - self._url_match = None - - # Regular expression for URLs we are interested in (our internal - # URLs) - self._exclude = None - - # Timeout in seconds to wait, so that we do not kill our server - self._wait_time = 0 - - # Verbose - self._verbose = True - - # Debug - self._debug = False - - # Report 40x http codes as deadlinks - self._report40x = False - - # For progress reporting - self._pages = 0 - self._links = 0 - self._via = 0 - self._dead = 0 - - @property - def restrict(self): - return self._url_match - - @restrict.setter - def restrict(self, url_match): - self._url_match = re.compile(url_match) - - @property - def exclude(self): - return self._exclude - - @exclude.setter - def exclude(self, exclude): - self._exclude = re.compile(exclude) - - @property - def verbose(self): - return self._verbose - - @verbose.setter - def verbose(self, verbose): - self._verbose = verbose - - @property - def debug(self): - return self._debug - - @debug.setter - def debug(self, debug): - self._verbose = debug - self._debug = debug - - @property - def report40x(self): - return self._report40x - - @report40x.setter - def report40x(self, report40x): - self._report40x = report40x - - @property - def wait_time(self): - return self._wait_time - - @wait_time.setter - def wait_time(self, seconds): - if seconds >= 0: - self._wait_time = seconds - - @property - def polite_time(self): - return self.frontier.polite_time - - @polite_time.setter - def polite_time(self, seconds): - if seconds >= 0: - self.frontier.polite_time = seconds - - def crawl(self): - _starttime = time.time() - if self.restrict == None: - self.restrict = "http://%s.*" % self.init_domain - - print "Deadlink-crawler version 1.1" - print "Starting crawl from URL %s at %s with restriction %s\n" % (self.init_url, strftime("%Y-%m-%d %H:%M:%S", gmtime()), "http://%s.*" % self.init_domain) - - while len(self.frontier) > 0: - time.sleep(self.wait_time) - - next_time, next_url = self.frontier.next() - - while time.time() < next_time: - time.sleep(0.5) - - try: - self.visit_url(next_url[0], next_url[1]) - except urllib2.URLError: - continue - - self.print_deadlinks(self.deadlinks) - - _elapsed = time.time() - _starttime - - print "\nSummary:\n--------" - print "Crawled %d pages and checked %d links in %s time." 
% (self._pages, self._links, strftime("%H:%M:%S", gmtime(_elapsed))) - print "Found a total of %d deadlinks in %d different pages" % (self._dead, self._via) - - if len(self.deadlinks) == 0: - exit(0) - else: - exit(2) - - def visit_url(self, url, found_via): - response = self.check_url(url, found_via) - - self.frontier.notify_visit(url) - - if response != None and not self.excluded(url): - self.collect_new_urls(url, response.read()) - - def collect_new_urls(self, url, html): - if self._verbose: - print("Processing %s" % url) - - # Keep track of how many of our site's pages we have crawled, and print status now and then - self._pages += 1 - if self._pages % 100 == 0: - print >> sys.stderr, "Processed %s links from %s pages" % (self._links, self._pages) - - try: - for page in self.extract_urls(html): - if page != None: - page = page.strip() # Handle some malformed links - page = urlparse.urljoin(url, page) - if self._exclude != None and self._exclude.search(page): - if self._debug: - print "Not adding link %s to crawl backlog (excluded by --exclude rule)" % page - else: - if self.frontier.add(page, url): - if self._debug: - print("Adding link %s to crawl backlog" % page) - except UnicodeEncodeError: - pass - - def check_url(self, url, found_via): - if self._exclude != None and self._exclude.search(url): - if self._debug: - print "Not checking URL %s (excluded by --exclude rule)" % url - return None - - if self._debug: - print("Checking URL: %s" % url) - - self._links += 1 - request = urllib2.Request(url) - - try: - response = urllib2.urlopen(request, timeout=10) - except urllib2.HTTPError as e: - # We receive an exception in case of 404 - if (e.code == 403 or e.code == 401 or e.code == 407 or e.code == 415) and not self._report40x: - if self._debug: - print "Got HTTP %s - not adding to deadlinks list (control with --report40x=True)" % (e.code) - else: - if self._debug: - print "Got HTTP %s - Adding to deadlinks list" % (e.code) - self.add_to_deadlinks(url, found_via) - return None - except httplib.BadStatusLine: - if self._verbose: - print "Got Exception BadStatusLine for url %s - Adding to deadlinks list" % url - self.add_to_deadlinks(url, found_via) - return None - except UnicodeEncodeError: - if self._verbose: - print "Got UnicodeEncodeError for url %s, skipping" % url - return None - except urllib2.URLError as e: - if self._verbose: - print "Got URLError for page %s" % url - return None - except socket.timeout as e: - print type(e) #catched - if self._verbose: - print "Got timeout reading page %s, skipping" % url - return None - - status = response.getcode() - redirurl = response.geturl() - if url != redirurl: - if self._debug: - print "Followed redirect from %s to %s" % (url, redirurl) - url = redirurl - if status != None and status >= 400: - self.add_to_deadlinks(url, found_via) - - return response - - def add_to_deadlinks(self, url, found_via): - self.deadlinks.setdefault(found_via, []) - self.deadlinks[found_via].append(url) - - self._dead += 1 - - if self._verbose: - print " Found deadlink: %s" % url - - def extract_urls(self, page): - soup = BeautifulSoup(page) - return [link.get('href') for link in soup.findAll('a')] - - def excluded(self, url): - outside = self._url_match != None and not self._url_match.search(url) - excluded = self._exclude != None and self._exclude.search(url) - if excluded and self._debug: - print "Not following URL %s which is excluded by --exclude rule" % url - return outside or excluded - - def print_deadlinks(self, deadlinks): - if len(deadlinks) == 0: - 
print("\nNo deadlinks were found. Hooray!") - else: - print("\nThe following deadlinks were found\n") - for via in deadlinks: - self._via += 1 - print("%s" % via) - for target in deadlinks[via]: - print("\t%s" % target) + def __init__(self, init_url): + self.init_url = init_url + self.init_domain = urllib.parse.urlparse(init_url).netloc + + # Manages our domains we want to visit or have visited + self.frontier = frontier.Frontier() + self.frontier.add(init_url, None) + + # List of deadlinks for each URL we have, + # i.e. url1: [deadlink1, deadlink2] + self.deadlinks = {} + + # Regular expression for URLs we are interested in (our internal + # URLs) + self._url_match = None + + # Regular expression for URLs we are interested in (our internal + # URLs) + self._exclude = None + + # Timeout in seconds to wait, so that we do not kill our server + self._wait_time = 0 + + # Verbose + self._verbose = True + + # Debug + self._debug = False + + # Report 40x http codes as deadlinks + self._report40x = False + + # For progress reporting + self._pages = 0 + self._links = 0 + self._via = 0 + self._dead = 0 + + @property + def restrict(self): + return self._url_match + + @restrict.setter + def restrict(self, url_match): + self._url_match = re.compile(url_match) + + @property + def exclude(self): + return self._exclude + + @exclude.setter + def exclude(self, exclude): + self._exclude = re.compile(exclude) + + @property + def verbose(self): + return self._verbose + + @verbose.setter + def verbose(self, verbose): + self._verbose = verbose + + @property + def debug(self): + return self._debug + + @debug.setter + def debug(self, debug): + self._verbose = debug + self._debug = debug + + @property + def report40x(self): + return self._report40x + + @report40x.setter + def report40x(self, report40x): + self._report40x = report40x + + @property + def wait_time(self): + return self._wait_time + + @wait_time.setter + def wait_time(self, seconds): + if seconds >= 0: + self._wait_time = seconds + + @property + def polite_time(self): + return self.frontier.polite_time + + @polite_time.setter + def polite_time(self, seconds): + if seconds >= 0: + self.frontier.polite_time = seconds + + def crawl(self): + _starttime = time.time() + if self.restrict is None: + self.restrict = "http://%s.*" % self.init_domain + + print("Deadlink-crawler version 1.1") + print("Starting crawl from URL %s at %s with restriction %s\n" + % (self.init_url, + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()), + "http://%s.*" % self.init_domain)) + + while len(self.frontier) > 0: + time.sleep(self.wait_time) + + next_time, next_url = self.frontier.next() + + while time.time() < next_time: + time.sleep(0.5) + + try: + self.visit_url(next_url[0], next_url[1]) + except urllib.error.URLError: + continue + + self.print_deadlinks(self.deadlinks) + + _elapsed = time.time() - _starttime + + print("\nSummary:\n--------") + print("Crawled %d pages and checked %d links in %s time." 
+ % (self._pages, self._links, + time.strftime("%H:%M:%S", time.gmtime(_elapsed)))) + print("Found a total of %d deadlinks in %d different pages" + % (self._dead, self._via)) + + if len(self.deadlinks) == 0: + exit(0) + else: + exit(2) + + def visit_url(self, url, found_via): + response = self.check_url(url, found_via) + + self.frontier.notify_visit(url) + + if response is not None and not self.excluded(url): + self.collect_new_urls(url, response.read()) + + def collect_new_urls(self, url, html): + if self._verbose: + print("Processing %s" % url) + + # Keep track of how many of our site's pages we have crawled, + # and print status now and then + self._pages += 1 + if self._pages % 100 == 0: + print("Processed %s links from %s pages" + % (self._links, self._pages)) + + try: + for page in self.extract_urls(html): + if page is not None: + page = page.strip() # Handle some malformed links + page = urllib.parse.urljoin(url, page) + if self._exclude is not None and self._exclude.search(page): + if self._debug: + print("Not adding link %s to crawl backlog " + + "(excluded by --exclude rule)" % page) + else: + if self.frontier.add(page, url): + if self._debug: + print("Adding link %s to crawl backlog" % page) + except UnicodeEncodeError: + pass + + def check_url(self, url, found_via): + if self._exclude is not None and self._exclude.search(url): + if self._debug: + print("Not checking URL %s (excluded by --exclude rule)" % url) + return None + + if self._debug: + print("Checking URL: %s" % url) + + self._links += 1 + request = urllib.request.Request(url) + + try: + response = urllib.request.urlopen(request, timeout=10) + except urllib.error.HTTPError as e: + # We receive an exception in case of 404 + if (e.code == 403 or e.code == 401 or e.code == 407 + or e.code == 415) and not self._report40x: + if self._debug: + print("Got HTTP %s - not adding to deadlinks list " + + "(control with --report40x=True)" % (e.code)) + else: + if self._debug: + print("Got HTTP %s - Adding to deadlinks list" % (e.code)) + self.add_to_deadlinks(url, found_via) + return None + except http.client.BadStatusLine: + if self._verbose: + print("Got Exception BadStatusLine for url %s - Adding to " + + "deadlinks list" % url) + self.add_to_deadlinks(url, found_via) + return None + except UnicodeEncodeError: + if self._verbose: + print("Got UnicodeEncodeError for url %s, skipping" % url) + return None + except urllib.error.URLError as e: + if self._verbose: + print("Got URLError for page %s" % url) + return None + except socket.timeout as e: + print(type(e)) # catched + if self._verbose: + print("Got timeout reading page %s, skipping" % url) + return None + + status = response.getcode() + redirurl = response.geturl() + if url != redirurl: + if self._debug: + print("Followed redirect from %s to %s" % (url, redirurl)) + url = redirurl + if status is not None and status >= 400: + self.add_to_deadlinks(url, found_via) + + return response + + def add_to_deadlinks(self, url, found_via): + self.deadlinks.setdefault(found_via, []) + self.deadlinks[found_via].append(url) + + self._dead += 1 + + if self._verbose: + print(" Found deadlink: %s" % url) + + def extract_urls(self, page): + soup = BeautifulSoup(page, "html.parser") + return [link.get('href') for link in soup.findAll('a')] + + def excluded(self, url): + outside = self._url_match is not None \ + and not self._url_match.search(url) + excluded = self._exclude is not None and self._exclude.search(url) + if excluded and self._debug: + print("Not following URL %s which is excluded 
by --exclude rule" + % url) + return outside or excluded + + def print_deadlinks(self, deadlinks): + if len(deadlinks) == 0: + print("\nNo deadlinks were found. Hooray!") + else: + print("\nThe following deadlinks were found\n") + for via in deadlinks: + self._via += 1 + print("%s" % via) + for target in deadlinks[via]: + print("\t%s" % target) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Search a website for deadlinks") - parser.add_argument('url', metavar='URL', type=str, help="The starting point for your crawl") - parser.add_argument('--restrict', dest='restrict', help="Restrict the crawl to specific URLs via a regular expression (usually your own domain") - parser.add_argument('--wait', dest='wait_time', type=float, help="Set some waiting time between each URL fetch (default=0)") - parser.add_argument('--politeness', dest='polite_time', type=float, help="Set the time to wait between calling two URLs of the same domain (default=1)") - parser.add_argument('--exclude', dest='exclude', help="Exclude URLs matching the given regex from crawl and deadlink-checking") - parser.add_argument('--silent', dest='silent', action='store_true', default=False, help="Turn off verbose output") - parser.add_argument('--debug', dest='debug', action='store_true', default=False, help="Be super-verbose") - parser.add_argument('--report40x', dest='report40x', action='store_true', default=False, help="Report only 404 as dead, not the other 40x errors") - - args = parser.parse_args() - - c = Crawler(args.url) - if args.restrict: - c.restrict = args.restrict - if args.wait_time: - c.wait_time = args.wait_time - if args.polite_time: - c.polite_time = args.polite_time - if args.silent: - c.verbose = not args.silent - if args.debug: - c.debug = args.debug - if args.report40x: - c.report40x = args.report40x - if args.exclude: - c.exclude = args.exclude - c.crawl() + parser = argparse.ArgumentParser( + description="Search a website for deadlinks") + parser.add_argument('url', metavar='URL', type=str, + help="The starting point for your crawl") + parser.add_argument('--restrict', dest='restrict', + help="Restrict the crawl to specific URLs via a " + + "regular expression (usually your own domain") + parser.add_argument('--wait', dest='wait_time', type=float, + help="Set some waiting time between each URL " + + "fetch (default=0)") + parser.add_argument('--politeness', dest='polite_time', type=float, + help="Set the time to wait between calling two URLs " + + "of the same domain (default=1)") + parser.add_argument('--exclude', dest='exclude', + help="Exclude URLs matching the given regex from " + + "crawl and deadlink-checking") + parser.add_argument('--silent', dest='silent', action='store_true', + default=False, help="Turn off verbose output") + parser.add_argument('--debug', dest='debug', action='store_true', + default=False, help="Be super-verbose") + parser.add_argument('--report40x', dest='report40x', action='store_true', + default=False, + help="Report only 404 as dead, not the other " + + "40x errors") + + args = parser.parse_args() + + c = Crawler(args.url) + if args.restrict: + c.restrict = args.restrict + if args.wait_time: + c.wait_time = args.wait_time + if args.polite_time: + c.polite_time = args.polite_time + if args.silent: + c.verbose = not args.silent + if args.debug: + c.debug = args.debug + if args.report40x: + c.report40x = args.report40x + if args.exclude: + c.exclude = args.exclude + c.crawl() diff --git a/frontier.py b/frontier.py @@ -16,86 +16,87 @@ See the License 
for the specific language governing permissions and limitations under the License. """ -import urlparse +import urllib.parse import time import heapq + class Frontier(object): - def __init__(self): - # A list containing the next crawltimes on domain level, - # to achieve a optimal throughput maintaining a polite policy - self.frontier = [] - self.websites = {} - - # Urls we have already found and in our set - self.found = set() - - self._polite_time = 1 - - @property - def polite_time(self): - return self._polite_time - - @polite_time.setter - def polite_time(self, seconds): - if seconds >= 0: - self._polite_time = seconds - - def add(self, url, found_via, defrag=True): - if defrag: - url, frag = urlparse.urldefrag(url) - if url in self.found: - return False - - domain = urlparse.urlparse(url).netloc - - # means this is the first URL in our set - if domain in self.websites: - website = self.websites[domain] - else: - website = Website(domain) - heapq.heappush(self.frontier, (time.time(), website)) - self.websites[domain] = website - - website.add_url(url, found_via) - self.found.add(url) - - return True - - def next(self): - next_time, next_domain = heapq.heappop(self.frontier) - - next_url = next_domain.next_url() - - return next_time, next_url - - def notify_visit(self, url): - domain = urlparse.urlparse(url).netloc - website = self.websites[domain] - - # If there are still other urls on this domain to crawl, add crawl time - if len(website.urls) > 0: - heapq.heappush(self.frontier, (time.time() + self.polite_time, website)) - else: - del(self.websites[domain]) - - def __len__(self): - return sum([len(website.urls) for time, website - in self.frontier]) + def __init__(self): + # A list containing the next crawltimes on domain level, + # to achieve a optimal throughput maintaining a polite policy + self.frontier = [] + self.websites = {} + + # Urls we have already found and in our set + self.found = set() + + self._polite_time = 1 + + @property + def polite_time(self): + return self._polite_time + + @polite_time.setter + def polite_time(self, seconds): + if seconds >= 0: + self._polite_time = seconds + + def add(self, url, found_via, defrag=True): + if defrag: + url, frag = urllib.parse.urldefrag(url) + if url in self.found: + return False + + domain = urllib.parse.urlparse(url).netloc + + # means this is the first URL in our set + if domain in self.websites: + website = self.websites[domain] + else: + website = Website(domain) + heapq.heappush(self.frontier, (time.time(), website)) + self.websites[domain] = website + + website.add_url(url, found_via) + self.found.add(url) + + return True + + def next(self): + next_time, next_domain = heapq.heappop(self.frontier) + + next_url = next_domain.next_url() + + return next_time, next_url + + def notify_visit(self, url): + domain = urllib.parse.urlparse(url).netloc + website = self.websites[domain] + + # If there are still other urls on this domain to crawl, add crawl time + if len(website.urls) > 0: + heapq.heappush(self.frontier, + (time.time() + self.polite_time, website)) + else: + del(self.websites[domain]) + + def __len__(self): + return sum([len(website.urls) for time, website in self.frontier]) class Website(object): - def __init__(self, domain): - self.domain = domain - self.urls = [] - self.robots = None - - def is_allowed(self, url): - # TODO - return True - - def add_url(self, url, found_via): - self.urls.append((url, found_via)) - - def next_url(self): - return self.urls.pop() + def __init__(self, domain): + self.domain = domain + 
self.urls = [] + self.robots = None + + def is_allowed(self, url): + # TODO + return True + + def add_url(self, url, found_via): + self.urls.append((url, found_via)) + + def next_url(self): + return self.urls.pop() diff --git a/requirements.txt b/requirements.txt @@ -0,0 +1 @@ +beautifulsoup4
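
For reference, the command-line interface defined in the __main__ block of crawler.py can also be driven programmatically. A hypothetical usage sketch follows; the URL and settings are examples only, and each attribute maps to one of the --restrict, --exclude, --wait and --politeness options shown above.

# Hypothetical programmatic use of the ported Crawler class; the URL and
# values below are illustrative.
from crawler import Crawler

c = Crawler("http://example.com/")
c.restrict = r"http://example\.com/.*"  # same effect as --restrict
c.exclude = r"\.pdf$"                   # same effect as --exclude
c.wait_time = 0.5                       # seconds between fetches (--wait)
c.polite_time = 2                       # per-domain delay in seconds (--politeness)
c.crawl()                               # exits with 0 if no dead links were found, 2 otherwise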