commit 74551fb8b5e0e1854774726ded07b20690a85306
parent 8a130178da817297c47a7e5bedaad4fd5bfa38c9
Author: Stefan Koch <misc@stefan-koch.name>
Date: Fri, 18 May 2018 19:52:00 +0200
make python3 ready
Diffstat:
M .gitignore | 2 ++
M crawler.py | 591 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------
M frontier.py | 153 ++++++++++++++++++++++++++++++++++++++++---------------------------------------
A requirements.txt | 1 +
4 files changed, 391 insertions(+), 356 deletions(-)
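In short, the port swaps each Python 2 module for its Python 3 counterpart and turns print statements into print() calls. A minimal sketch of the mapping used throughout crawler.py and frontier.py (the example URL and markup below are placeholders, not taken from the code):

    from bs4 import BeautifulSoup   # was: from BeautifulSoup import BeautifulSoup
    import urllib.request           # was: urllib2.Request / urllib2.urlopen
    import urllib.error             # was: urllib2.HTTPError / urllib2.URLError
    import urllib.parse             # was: urlparse.urlparse / urljoin / urldefrag
    import http.client              # was: httplib.BadStatusLine

    # print is a function now, and exceptions are caught with "as":
    #   except urllib.error.URLError as e: ...
    print(urllib.parse.urlparse("http://example.org/page").netloc)  # example.org
    soup = BeautifulSoup("<a href='/x'>x</a>", "html.parser")
    print([a.get('href') for a in soup.find_all('a')])              # ['/x']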
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
+env
+
*.py[cod]
# C extensions
diff --git a/crawler.py b/crawler.py
@@ -16,292 +16,323 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
-from BeautifulSoup import BeautifulSoup
-from time import gmtime, strftime, time
-import urllib2, httplib, urlparse
-import re
+from bs4 import BeautifulSoup
import time
-import sys
+import urllib.request
+import urllib.error
+import urllib.parse
+import http.client
+import re
import argparse
import socket
import frontier
+
class Crawler(object):
- def __init__(self, init_url):
- self.init_url = init_url
- self.init_domain = urlparse.urlparse(init_url).netloc
-
- # Manages our domains we want to visit or have visited
- self.frontier = frontier.Frontier()
- self.frontier.add(init_url, None)
-
- # List of deadlinks for each URL we have,
- # i.e. url1: [deadlink1, deadlink2]
- self.deadlinks = {}
-
- # Regular expression for URLs we are interested in (our internal
- # URLs)
- self._url_match = None
-
- # Regular expression for URLs we are interested in (our internal
- # URLs)
- self._exclude = None
-
- # Timeout in seconds to wait, so that we do not kill our server
- self._wait_time = 0
-
- # Verbose
- self._verbose = True
-
- # Debug
- self._debug = False
-
- # Report 40x http codes as deadlinks
- self._report40x = False
-
- # For progress reporting
- self._pages = 0
- self._links = 0
- self._via = 0
- self._dead = 0
-
- @property
- def restrict(self):
- return self._url_match
-
- @restrict.setter
- def restrict(self, url_match):
- self._url_match = re.compile(url_match)
-
- @property
- def exclude(self):
- return self._exclude
-
- @exclude.setter
- def exclude(self, exclude):
- self._exclude = re.compile(exclude)
-
- @property
- def verbose(self):
- return self._verbose
-
- @verbose.setter
- def verbose(self, verbose):
- self._verbose = verbose
-
- @property
- def debug(self):
- return self._debug
-
- @debug.setter
- def debug(self, debug):
- self._verbose = debug
- self._debug = debug
-
- @property
- def report40x(self):
- return self._report40x
-
- @report40x.setter
- def report40x(self, report40x):
- self._report40x = report40x
-
- @property
- def wait_time(self):
- return self._wait_time
-
- @wait_time.setter
- def wait_time(self, seconds):
- if seconds >= 0:
- self._wait_time = seconds
-
- @property
- def polite_time(self):
- return self.frontier.polite_time
-
- @polite_time.setter
- def polite_time(self, seconds):
- if seconds >= 0:
- self.frontier.polite_time = seconds
-
- def crawl(self):
- _starttime = time.time()
- if self.restrict == None:
- self.restrict = "http://%s.*" % self.init_domain
-
- print "Deadlink-crawler version 1.1"
- print "Starting crawl from URL %s at %s with restriction %s\n" % (self.init_url, strftime("%Y-%m-%d %H:%M:%S", gmtime()), "http://%s.*" % self.init_domain)
-
- while len(self.frontier) > 0:
- time.sleep(self.wait_time)
-
- next_time, next_url = self.frontier.next()
-
- while time.time() < next_time:
- time.sleep(0.5)
-
- try:
- self.visit_url(next_url[0], next_url[1])
- except urllib2.URLError:
- continue
-
- self.print_deadlinks(self.deadlinks)
-
- _elapsed = time.time() - _starttime
-
- print "\nSummary:\n--------"
- print "Crawled %d pages and checked %d links in %s time." % (self._pages, self._links, strftime("%H:%M:%S", gmtime(_elapsed)))
- print "Found a total of %d deadlinks in %d different pages" % (self._dead, self._via)
-
- if len(self.deadlinks) == 0:
- exit(0)
- else:
- exit(2)
-
- def visit_url(self, url, found_via):
- response = self.check_url(url, found_via)
-
- self.frontier.notify_visit(url)
-
- if response != None and not self.excluded(url):
- self.collect_new_urls(url, response.read())
-
- def collect_new_urls(self, url, html):
- if self._verbose:
- print("Processing %s" % url)
-
- # Keep track of how many of our site's pages we have crawled, and print status now and then
- self._pages += 1
- if self._pages % 100 == 0:
- print >> sys.stderr, "Processed %s links from %s pages" % (self._links, self._pages)
-
- try:
- for page in self.extract_urls(html):
- if page != None:
- page = page.strip() # Handle some malformed links
- page = urlparse.urljoin(url, page)
- if self._exclude != None and self._exclude.search(page):
- if self._debug:
- print "Not adding link %s to crawl backlog (excluded by --exclude rule)" % page
- else:
- if self.frontier.add(page, url):
- if self._debug:
- print("Adding link %s to crawl backlog" % page)
- except UnicodeEncodeError:
- pass
-
- def check_url(self, url, found_via):
- if self._exclude != None and self._exclude.search(url):
- if self._debug:
- print "Not checking URL %s (excluded by --exclude rule)" % url
- return None
-
- if self._debug:
- print("Checking URL: %s" % url)
-
- self._links += 1
- request = urllib2.Request(url)
-
- try:
- response = urllib2.urlopen(request, timeout=10)
- except urllib2.HTTPError as e:
- # We receive an exception in case of 404
- if (e.code == 403 or e.code == 401 or e.code == 407 or e.code == 415) and not self._report40x:
- if self._debug:
- print "Got HTTP %s - not adding to deadlinks list (control with --report40x=True)" % (e.code)
- else:
- if self._debug:
- print "Got HTTP %s - Adding to deadlinks list" % (e.code)
- self.add_to_deadlinks(url, found_via)
- return None
- except httplib.BadStatusLine:
- if self._verbose:
- print "Got Exception BadStatusLine for url %s - Adding to deadlinks list" % url
- self.add_to_deadlinks(url, found_via)
- return None
- except UnicodeEncodeError:
- if self._verbose:
- print "Got UnicodeEncodeError for url %s, skipping" % url
- return None
- except urllib2.URLError as e:
- if self._verbose:
- print "Got URLError for page %s" % url
- return None
- except socket.timeout as e:
- print type(e) #catched
- if self._verbose:
- print "Got timeout reading page %s, skipping" % url
- return None
-
- status = response.getcode()
- redirurl = response.geturl()
- if url != redirurl:
- if self._debug:
- print "Followed redirect from %s to %s" % (url, redirurl)
- url = redirurl
- if status != None and status >= 400:
- self.add_to_deadlinks(url, found_via)
-
- return response
-
- def add_to_deadlinks(self, url, found_via):
- self.deadlinks.setdefault(found_via, [])
- self.deadlinks[found_via].append(url)
-
- self._dead += 1
-
- if self._verbose:
- print " Found deadlink: %s" % url
-
- def extract_urls(self, page):
- soup = BeautifulSoup(page)
- return [link.get('href') for link in soup.findAll('a')]
-
- def excluded(self, url):
- outside = self._url_match != None and not self._url_match.search(url)
- excluded = self._exclude != None and self._exclude.search(url)
- if excluded and self._debug:
- print "Not following URL %s which is excluded by --exclude rule" % url
- return outside or excluded
-
- def print_deadlinks(self, deadlinks):
- if len(deadlinks) == 0:
- print("\nNo deadlinks were found. Hooray!")
- else:
- print("\nThe following deadlinks were found\n")
- for via in deadlinks:
- self._via += 1
- print("%s" % via)
- for target in deadlinks[via]:
- print("\t%s" % target)
+ def __init__(self, init_url):
+ self.init_url = init_url
+ self.init_domain = urllib.parse.urlparse(init_url).netloc
+
+ # Manages the URLs we still want to visit or have already visited, grouped by domain
+ self.frontier = frontier.Frontier()
+ self.frontier.add(init_url, None)
+
+ # Dict of deadlinks found on each page we crawled,
+ # e.g. url1: [deadlink1, deadlink2]
+ self.deadlinks = {}
+
+ # Regular expression for URLs we are interested in (our internal
+ # URLs)
+ self._url_match = None
+
+ # Regular expression for URLs we want to exclude from the crawl
+ # (--exclude)
+ self._exclude = None
+
+ # Time in seconds to wait between fetches, so that we do not kill our server
+ self._wait_time = 0
+
+ # Verbose
+ self._verbose = True
+
+ # Debug
+ self._debug = False
+
+ # Report 40x http codes as deadlinks
+ self._report40x = False
+
+ # For progress reporting
+ self._pages = 0
+ self._links = 0
+ self._via = 0
+ self._dead = 0
+
+ @property
+ def restrict(self):
+ return self._url_match
+
+ @restrict.setter
+ def restrict(self, url_match):
+ self._url_match = re.compile(url_match)
+
+ @property
+ def exclude(self):
+ return self._exclude
+
+ @exclude.setter
+ def exclude(self, exclude):
+ self._exclude = re.compile(exclude)
+
+ @property
+ def verbose(self):
+ return self._verbose
+
+ @verbose.setter
+ def verbose(self, verbose):
+ self._verbose = verbose
+
+ @property
+ def debug(self):
+ return self._debug
+
+ @debug.setter
+ def debug(self, debug):
+ self._verbose = debug
+ self._debug = debug
+
+ @property
+ def report40x(self):
+ return self._report40x
+
+ @report40x.setter
+ def report40x(self, report40x):
+ self._report40x = report40x
+
+ @property
+ def wait_time(self):
+ return self._wait_time
+
+ @wait_time.setter
+ def wait_time(self, seconds):
+ if seconds >= 0:
+ self._wait_time = seconds
+
+ @property
+ def polite_time(self):
+ return self.frontier.polite_time
+
+ @polite_time.setter
+ def polite_time(self, seconds):
+ if seconds >= 0:
+ self.frontier.polite_time = seconds
+
+ def crawl(self):
+ _starttime = time.time()
+ if self.restrict is None:
+ self.restrict = "http://%s.*" % self.init_domain
+
+ print("Deadlink-crawler version 1.1")
+ print("Starting crawl from URL %s at %s with restriction %s\n"
+ % (self.init_url,
+ time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()),
+ "http://%s.*" % self.init_domain))
+
+ while len(self.frontier) > 0:
+ time.sleep(self.wait_time)
+
+ next_time, next_url = self.frontier.next()
+
+ while time.time() < next_time:
+ time.sleep(0.5)
+
+ try:
+ self.visit_url(next_url[0], next_url[1])
+ except urllib.error.URLError:
+ continue
+
+ self.print_deadlinks(self.deadlinks)
+
+ _elapsed = time.time() - _starttime
+
+ print("\nSummary:\n--------")
+ print("Crawled %d pages and checked %d links in %s time."
+ % (self._pages, self._links,
+ time.strftime("%H:%M:%S", time.gmtime(_elapsed))))
+ print("Found a total of %d deadlinks in %d different pages"
+ % (self._dead, self._via))
+
+ if len(self.deadlinks) == 0:
+ exit(0)
+ else:
+ exit(2)
+
+ def visit_url(self, url, found_via):
+ response = self.check_url(url, found_via)
+
+ self.frontier.notify_visit(url)
+
+ if response is not None and not self.excluded(url):
+ self.collect_new_urls(url, response.read())
+
+ def collect_new_urls(self, url, html):
+ if self._verbose:
+ print("Processing %s" % url)
+
+ # Keep track of how many of our site's pages we have crawled,
+ # and print status now and then
+ self._pages += 1
+ if self._pages % 100 == 0:
+ print("Processed %s links from %s pages"
+ % (self._links, self._pages))
+
+ try:
+ for page in self.extract_urls(html):
+ if page is not None:
+ page = page.strip() # Handle some malformed links
+ page = urllib.parse.urljoin(url, page)
+ if self._exclude is not None and self._exclude.search(page):
+ if self._debug:
+ print("Not adding link %s to crawl backlog "
+ + "(excluded by --exclude rule)" % page)
+ else:
+ if self.frontier.add(page, url):
+ if self._debug:
+ print("Adding link %s to crawl backlog" % page)
+ except UnicodeEncodeError:
+ pass
+
+ def check_url(self, url, found_via):
+ if self._exclude is not None and self._exclude.search(url):
+ if self._debug:
+ print("Not checking URL %s (excluded by --exclude rule)" % url)
+ return None
+
+ if self._debug:
+ print("Checking URL: %s" % url)
+
+ self._links += 1
+ request = urllib.request.Request(url)
+
+ try:
+ response = urllib.request.urlopen(request, timeout=10)
+ except urllib.error.HTTPError as e:
+ # 404s and other HTTP errors arrive as HTTPError exceptions
+ if (e.code == 403 or e.code == 401 or e.code == 407
+ or e.code == 415) and not self._report40x:
+ if self._debug:
+ print("Got HTTP %s - not adding to deadlinks list "
+ + "(control with --report40x=True)" % (e.code))
+ else:
+ if self._debug:
+ print("Got HTTP %s - Adding to deadlinks list" % (e.code))
+ self.add_to_deadlinks(url, found_via)
+ return None
+ except http.client.BadStatusLine:
+ if self._verbose:
+ print("Got Exception BadStatusLine for url %s - Adding to "
+ + "deadlinks list" % url)
+ self.add_to_deadlinks(url, found_via)
+ return None
+ except UnicodeEncodeError:
+ if self._verbose:
+ print("Got UnicodeEncodeError for url %s, skipping" % url)
+ return None
+ except urllib.error.URLError as e:
+ if self._verbose:
+ print("Got URLError for page %s" % url)
+ return None
+ except socket.timeout as e:
+ print(type(e)) # caught socket.timeout
+ if self._verbose:
+ print("Got timeout reading page %s, skipping" % url)
+ return None
+
+ status = response.getcode()
+ redirurl = response.geturl()
+ if url != redirurl:
+ if self._debug:
+ print("Followed redirect from %s to %s" % (url, redirurl))
+ url = redirurl
+ if status is not None and status >= 400:
+ self.add_to_deadlinks(url, found_via)
+
+ return response
+
+ def add_to_deadlinks(self, url, found_via):
+ self.deadlinks.setdefault(found_via, [])
+ self.deadlinks[found_via].append(url)
+
+ self._dead += 1
+
+ if self._verbose:
+ print(" Found deadlink: %s" % url)
+
+ def extract_urls(self, page):
+ soup = BeautifulSoup(page, "html.parser")
+ return [link.get('href') for link in soup.find_all('a')]
+
+ def excluded(self, url):
+ outside = self._url_match is not None \
+ and not self._url_match.search(url)
+ excluded = self._exclude is not None and self._exclude.search(url)
+ if excluded and self._debug:
+ print("Not following URL %s which is excluded by --exclude rule"
+ % url)
+ return outside or excluded
+
+ def print_deadlinks(self, deadlinks):
+ if len(deadlinks) == 0:
+ print("\nNo deadlinks were found. Hooray!")
+ else:
+ print("\nThe following deadlinks were found\n")
+ for via in deadlinks:
+ self._via += 1
+ print("%s" % via)
+ for target in deadlinks[via]:
+ print("\t%s" % target)
if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="Search a website for deadlinks")
- parser.add_argument('url', metavar='URL', type=str, help="The starting point for your crawl")
- parser.add_argument('--restrict', dest='restrict', help="Restrict the crawl to specific URLs via a regular expression (usually your own domain")
- parser.add_argument('--wait', dest='wait_time', type=float, help="Set some waiting time between each URL fetch (default=0)")
- parser.add_argument('--politeness', dest='polite_time', type=float, help="Set the time to wait between calling two URLs of the same domain (default=1)")
- parser.add_argument('--exclude', dest='exclude', help="Exclude URLs matching the given regex from crawl and deadlink-checking")
- parser.add_argument('--silent', dest='silent', action='store_true', default=False, help="Turn off verbose output")
- parser.add_argument('--debug', dest='debug', action='store_true', default=False, help="Be super-verbose")
- parser.add_argument('--report40x', dest='report40x', action='store_true', default=False, help="Report only 404 as dead, not the other 40x errors")
-
- args = parser.parse_args()
-
- c = Crawler(args.url)
- if args.restrict:
- c.restrict = args.restrict
- if args.wait_time:
- c.wait_time = args.wait_time
- if args.polite_time:
- c.polite_time = args.polite_time
- if args.silent:
- c.verbose = not args.silent
- if args.debug:
- c.debug = args.debug
- if args.report40x:
- c.report40x = args.report40x
- if args.exclude:
- c.exclude = args.exclude
- c.crawl()
+ parser = argparse.ArgumentParser(
+ description="Search a website for deadlinks")
+ parser.add_argument('url', metavar='URL', type=str,
+ help="The starting point for your crawl")
+ parser.add_argument('--restrict', dest='restrict',
+ help="Restrict the crawl to specific URLs via a "
+ + "regular expression (usually your own domain")
+ parser.add_argument('--wait', dest='wait_time', type=float,
+ help="Set some waiting time between each URL "
+ + "fetch (default=0)")
+ parser.add_argument('--politeness', dest='polite_time', type=float,
+ help="Set the time to wait between calling two URLs "
+ + "of the same domain (default=1)")
+ parser.add_argument('--exclude', dest='exclude',
+ help="Exclude URLs matching the given regex from "
+ + "crawl and deadlink-checking")
+ parser.add_argument('--silent', dest='silent', action='store_true',
+ default=False, help="Turn off verbose output")
+ parser.add_argument('--debug', dest='debug', action='store_true',
+ default=False, help="Be super-verbose")
+ parser.add_argument('--report40x', dest='report40x', action='store_true',
+ default=False,
+ help="Report only 404 as dead, not the other "
+ + "40x errors")
+
+ args = parser.parse_args()
+
+ c = Crawler(args.url)
+ if args.restrict:
+ c.restrict = args.restrict
+ if args.wait_time:
+ c.wait_time = args.wait_time
+ if args.polite_time:
+ c.polite_time = args.polite_time
+ if args.silent:
+ c.verbose = not args.silent
+ if args.debug:
+ c.debug = args.debug
+ if args.report40x:
+ c.report40x = args.report40x
+ if args.exclude:
+ c.exclude = args.exclude
+ c.crawl()
diff --git a/frontier.py b/frontier.py
@@ -16,86 +16,87 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
-import urlparse
+import urllib.parse
import time
import heapq
+
class Frontier(object):
- def __init__(self):
- # A list containing the next crawltimes on domain level,
- # to achieve a optimal throughput maintaining a polite policy
- self.frontier = []
- self.websites = {}
-
- # Urls we have already found and in our set
- self.found = set()
-
- self._polite_time = 1
-
- @property
- def polite_time(self):
- return self._polite_time
-
- @polite_time.setter
- def polite_time(self, seconds):
- if seconds >= 0:
- self._polite_time = seconds
-
- def add(self, url, found_via, defrag=True):
- if defrag:
- url, frag = urlparse.urldefrag(url)
- if url in self.found:
- return False
-
- domain = urlparse.urlparse(url).netloc
-
- # means this is the first URL in our set
- if domain in self.websites:
- website = self.websites[domain]
- else:
- website = Website(domain)
- heapq.heappush(self.frontier, (time.time(), website))
- self.websites[domain] = website
-
- website.add_url(url, found_via)
- self.found.add(url)
-
- return True
-
- def next(self):
- next_time, next_domain = heapq.heappop(self.frontier)
-
- next_url = next_domain.next_url()
-
- return next_time, next_url
-
- def notify_visit(self, url):
- domain = urlparse.urlparse(url).netloc
- website = self.websites[domain]
-
- # If there are still other urls on this domain to crawl, add crawl time
- if len(website.urls) > 0:
- heapq.heappush(self.frontier, (time.time() + self.polite_time, website))
- else:
- del(self.websites[domain])
-
- def __len__(self):
- return sum([len(website.urls) for time, website
- in self.frontier])
+ def __init__(self):
+ # A heap containing the next crawl times on domain level,
+ # to achieve an optimal throughput while maintaining a polite policy
+ self.frontier = []
+ self.websites = {}
+
+ # URLs we have already found and added to our set
+ self.found = set()
+
+ self._polite_time = 1
+
+ @property
+ def polite_time(self):
+ return self._polite_time
+
+ @polite_time.setter
+ def polite_time(self, seconds):
+ if seconds >= 0:
+ self._polite_time = seconds
+
+ def add(self, url, found_via, defrag=True):
+ if defrag:
+ url, frag = urllib.parse.urldefrag(url)
+ if url in self.found:
+ return False
+
+ domain = urllib.parse.urlparse(url).netloc
+
+ # Reuse the entry for this domain, or create one if this is its first URL
+ if domain in self.websites:
+ website = self.websites[domain]
+ else:
+ website = Website(domain)
+ heapq.heappush(self.frontier, (time.time(), website))
+ self.websites[domain] = website
+
+ website.add_url(url, found_via)
+ self.found.add(url)
+
+ return True
+
+ def next(self):
+ next_time, next_domain = heapq.heappop(self.frontier)
+
+ next_url = next_domain.next_url()
+
+ return next_time, next_url
+
+ def notify_visit(self, url):
+ domain = urllib.parse.urlparse(url).netloc
+ website = self.websites[domain]
+
+ # If there are still other urls on this domain to crawl, add crawl time
+ if len(website.urls) > 0:
+ heapq.heappush(self.frontier,
+ (time.time() + self.polite_time, website))
+ else:
+ del self.websites[domain]
+
+ def __len__(self):
+ return sum(len(website.urls) for _, website in self.frontier)
class Website(object):
- def __init__(self, domain):
- self.domain = domain
- self.urls = []
- self.robots = None
-
- def is_allowed(self, url):
- # TODO
- return True
-
- def add_url(self, url, found_via):
- self.urls.append((url, found_via))
-
- def next_url(self):
- return self.urls.pop()
+ def __init__(self, domain):
+ self.domain = domain
+ self.urls = []
+ self.robots = None
+
+ def is_allowed(self, url):
+ # TODO
+ return True
+
+ def add_url(self, url, found_via):
+ self.urls.append((url, found_via))
+
+ def next_url(self):
+ return self.urls.pop()
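The Frontier keeps one heap entry per domain, keyed by the earliest time that domain may be hit again, so callers always get the next polite URL to fetch. A small usage sketch (the URLs are placeholders):

    from frontier import Frontier

    f = Frontier()
    f.add("http://example.org/a", None)
    f.add("http://example.org/b", "http://example.org/a")

    next_time, (url, found_via) = f.next()  # earliest-due domain, LIFO within it
    f.notify_visit(url)                     # re-queues the domain polite_time later
    print(len(f))                           # URLs still waiting to be crawled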
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1 @@
+beautifulsoup4
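With the dependency from requirements.txt installed, the ported code can be driven from Python 3 roughly as follows (a minimal sketch; the URL and option values are placeholders, and crawl() terminates the interpreter via exit()):

    from crawler import Crawler

    c = Crawler("http://example.org/")
    c.wait_time = 0.5      # pause between any two fetches
    c.polite_time = 1      # pause between two fetches on the same domain
    c.exclude = r"\.pdf$"  # skip URLs matching this regex
    c.crawl()              # prints a report, then exits 0 (clean) or 2 (deadlinks found)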