deadlink-crawler

[unmaintained] crawls a site to detect dead links

commit 6f83c00ddd3ab4a45b07da87ace0a3492dbd80c9
parent ec4b6107255053ec1c9e88cdf79bfdad4e5508cd
Author: Stefan <stefan@eliteinformatiker.de>
Date:   Mon, 28 Jan 2013 13:48:31 +0100

added python properties and cli politeness
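Note: the "python properties" half of this commit replaces the old set_url_restrict()/set_wait_time() methods with @property/@setter pairs whose setters silently ignore negative values. A minimal standalone sketch of that pattern (illustrative only, not the full class from the diff below):

    class Crawler(object):
        def __init__(self):
            self._wait_time = 0

        @property
        def wait_time(self):
            return self._wait_time

        @wait_time.setter
        def wait_time(self, seconds):
            # Negative values are ignored, as in the setters added below.
            if seconds >= 0:
                self._wait_time = seconds

    c = Crawler()
    c.wait_time = 2    # goes through the setter
    c.wait_time = -1   # rejected, wait_time stays 2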

Diffstat:
M crawler.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
1 file changed, 66 insertions(+), 19 deletions(-)

diff --git a/crawler.py b/crawler.py
@@ -1,18 +1,23 @@
 from BeautifulSoup import BeautifulSoup
-import urllib2, urlparse
+import urllib2, httplib, urlparse
 import re
 import time
-import httplib
+import heapq
 import argparse
 
-# TODO: Do not apply wait time to external links
-
 class Crawler(object):
     def __init__(self, init_url):
-        # A list of to be crawled urls, where the second element
-        # in the tuple is the URL via which we found this URL
-        self.pages = [(init_url, None)]
+        init_domain = urlparse.urlparse(init_url).netloc
+
+        # A list of urls that still have to be searched sorted by
+        # domains,
+        self.frontier = {}
+        self.frontier[init_domain] = [(init_url, None)]
+
+        # A list containing the next crawltimes on domain level,
+        # to achieve a optimal throughput maintaining a polite policy
+        self.crawltimes = [(time.time(), init_domain)]
 
         # Urls we have already visited
         self.found = set()
 
@@ -23,22 +28,57 @@ class Crawler(object):
 
         # Regular expression for URLs we are interested in (our internal
         # URLs)
-        self.url_match = None
+        self._url_match = None
 
         # Timeout in seconds to wait, so that we do not kill our server
-        self.wait_time = 0
+        self._wait_time = 0
+
+        # Timeout for waiting between each call to the same
+        # domain twice, this determines how polite the crawler is
+        self._polite_time = 1
+
+    @property
+    def restrict(self):
+        return self._url_match
 
-    def set_url_restrict(self, regexp):
-        self.url_match = re.compile(regexp)
+    @restrict.setter
+    def restrict(self, url_match):
+        self._url_match = re.compile(url_match)
 
-    def set_wait_time(self, seconds):
-        self.wait_time = seconds
+    @property
+    def wait_time(self):
+        return self._wait_time
+
+    @wait_time.setter
+    def wait_time(self, seconds):
+        if seconds >= 0:
+            self._wait_time = seconds
+
+    @property
+    def polite_time(self):
+        return self._polite_time
+
+    @polite_time.setter
+    def polite_time(self, seconds):
+        if seconds >= 0:
+            self._polite_time = seconds
 
     def crawl(self):
-        while len(self.pages) > 0:
+        while len(self.frontier) > 0:
             time.sleep(self.wait_time)
-            next_url = self.pages.pop()
+            next_time, next_domain = heapq.heappop(self.crawltimes)
+            next_url = self.frontier[next_domain].pop()
+
+            while time.time() < next_time:
+                time.sleep(0.5)
+
+            if len(self.frontier[next_domain]) > 0:
+                next_crawl = time.time() + self.polite_time
+                heapq.heappush(self.crawltimes,
+                               (next_crawl, next_domain))
+            else:
+                del(self.frontier[next_domain])
 
             try:
                 self.visit_url(next_url[0], next_url[1])
@@ -59,9 +99,13 @@ class Crawler(object):
         try:
             for page in self.extract_urls(html):
                 page = urlparse.urljoin(url, page)
+                domain = urlparse.urlparse(page).netloc
 
                 if not page in self.found:
-                    self.pages.append((page, url))
+                    if not domain in self.frontier:
+                        self.frontier.setdefault(domain, [])
+                        heapq.heappush(self.crawltimes, (time.time(), domain))
+                    self.frontier[domain].append((page, url))
                     self.found.add(page)
         except UnicodeEncodeError:
             pass
@@ -95,7 +139,7 @@ class Crawler(object):
         return [link.get('href') for link in soup.findAll('a')]
 
     def excluded(self, url):
-        return self.url_match != None and not self.url_match.search(url)
+        return self._url_match != None and not self._url_match.search(url)
 
     def print_deadlinks(self, deadlinks):
         if len(deadlinks) == 0:
@@ -114,12 +158,15 @@ if __name__ == "__main__":
     parser.add_argument('url', metavar='URL', type=str, help="The starting point for your crawl")
     parser.add_argument('--restrict', dest='restrict', help="Restrict the crawl to specific URLs via a regular expression (usually your own domain")
     parser.add_argument('--wait', dest='wait_time', type=float, help="Set some waiting time between each URL fetch")
+    parser.add_argument('--politeness', dest='polite_time', type=float, help="Set the time to wait between calling two URLs of the same domain")
 
     args = parser.parse_args()
 
     c = Crawler(args.url)
     if args.restrict:
-        c.set_url_restrict(args.restrict)
+        c.restrict = args.restrict
     if args.wait_time:
-        c.set_wait_time(args.wait_time)
+        c.wait_time = args.wait_time
+    if args.polite_time:
+        c.polite_time = args.polite_time
     c.crawl()
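The "cli politeness" half amounts to a small per-domain scheduler in crawl(): each domain sits in a min-heap keyed by the earliest time it may be fetched from again, and after a fetch the domain is pushed back with a timestamp polite_time seconds in the future. A rough self-contained sketch of that scheduling idea (crawl_politely and the print stand-in are made up for illustration; the real method also applies the global wait_time and handles errors):

    import heapq
    import time

    def crawl_politely(frontier, polite_time=1.0):
        # frontier: dict mapping domain -> list of URLs still to fetch,
        # a simplified stand-in for the crawler's self.frontier.
        crawltimes = [(time.time(), domain) for domain in frontier]
        heapq.heapify(crawltimes)

        while crawltimes:
            next_time, domain = heapq.heappop(crawltimes)

            # Block until this domain is allowed to be hit again.
            while time.time() < next_time:
                time.sleep(0.5)

            url = frontier[domain].pop()
            print("fetching %s" % url)   # stand-in for visit_url()

            # Re-schedule the domain only if it still has URLs queued.
            if frontier[domain]:
                heapq.heappush(crawltimes, (time.time() + polite_time, domain))
            else:
                del frontier[domain]

With the new --politeness flag the same delay can be set from the command line, e.g. something like python crawler.py --wait 0.5 --politeness 2 http://example.com (values and URL purely illustrative).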