deadlink-crawler

[unmaintained] crawls a site to detect dead links

commit a1612806241d715e7e92293e1f928a35ba62ba02
parent 7924e13a2029fe3faed9321bad84bb9b21629a0f
Author: Stefan <stefan@eliteinformatiker.de>
Date:   Tue,  5 Feb 2013 14:41:38 +0100

added website class for later usage

Diffstat:
M crawler.py  |  3 ++-
M frontier.py | 54 ++++++++++++++++++++++++++++++++++++------------------
2 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/crawler.py b/crawler.py
@@ -72,7 +72,7 @@ class Crawler(object):
 
         response = self.check_url(url, found_via)
         self.frontier.notify_visit(url)
-        
+
         if response != None and not self.excluded(url):
             self.collect_new_urls(url, response.read())
 
@@ -82,6 +82,7 @@ class Crawler(object):
         try:
             for page in self.extract_urls(html):
                 page = urlparse.urljoin(url, page)
+                print("adding page %s" % page)
                 self.frontier.add(page, url)
         except UnicodeEncodeError:
             pass
diff --git a/frontier.py b/frontier.py
@@ -4,13 +4,10 @@ import heapq
 
 class Frontier(object):
     def __init__(self):
-        # A list of urls that still have to be searched sorted by
-        # domains,
-        self.urls = {}
-
         # A list containing the next crawltimes on domain level,
         # to achieve a optimal throughput maintaining a polite policy
-        self.crawltimes = []
+        self.frontier = []
+        self.websites = {}
 
         # Urls we have already found and in our set
         self.found = set()
@@ -33,31 +30,52 @@ class Frontier(object):
         domain = urlparse.urlparse(url).netloc
 
         # means this is the first URL in our set
-        if not domain in self.urls:
-            self.urls[domain] = []
-            heapq.heappush(self.crawltimes, (time.time(), domain))
+        if domain in self.websites:
+            website = self.websites[domain]
+        else:
+            website = Website(domain)
+            heapq.heappush(self.frontier, (time.time(), website))
+            self.websites[domain] = website
 
-        self.urls[domain].append((url, found_via))
+        website.add_url(url, found_via)
         self.found.add(url)
 
         return True
 
     def next(self):
-        next_time, next_domain = heapq.heappop(self.crawltimes)
-
-        next_url = self.urls[next_domain].pop()
+        next_time, next_domain = heapq.heappop(self.frontier)
 
-        if len(self.urls[next_domain]) == 0:
-            del(self.urls[next_domain])
+        next_url = next_domain.next_url()
 
         return next_time, next_url
 
     def notify_visit(self, url):
         domain = urlparse.urlparse(url).netloc
-
+        website = self.websites[domain]
+
         # If there are still other urls on this domain to crawl, add crawl time
-        if domain in self.urls:
-            heapq.heappush(self.crawltimes, (time.time() + self.polite_time, domain))
+        if len(website.urls) > 0:
+            heapq.heappush(self.frontier, (time.time() + self.polite_time, website))
+        else:
+            del(self.websites[domain])
 
     def __len__(self):
-        return sum([len(self.urls[domain]) for domain in self.urls])
+        return sum([len(website.urls) for time, website
+            in self.frontier])
+
+
+class Website(object):
+    def __init__(self, domain):
+        self.domain = domain
+        self.urls = []
+        self.robots = None
+
+    def is_allowed(self, url):
+        # TODO
+        return True
+
+    def add_url(self, url, found_via):
+        self.urls.append((url, found_via))
+
+    def next_url(self):
+        return self.urls.pop()
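
The commit replaces the Frontier's per-domain URL dict with a heap of Website objects ordered by their next allowed crawl time. Below is a minimal sketch of how the refactored Frontier could be driven after this commit; the seed URL, the polite_time value and the sleep/fetch step are illustrative assumptions, not code from this repository (the real loop lives in crawler.py's Crawler class).

# Illustrative sketch only: polling the refactored Frontier from this commit.
# Assumes frontier.py is importable; the URL, polite_time and fetch step are
# made up for the example.
import time
from frontier import Frontier

frontier = Frontier()
frontier.polite_time = 1.0                 # assumed: per-domain delay used by notify_visit()
frontier.add("http://example.com/", None)  # seed URL; found_via is None for the start page

while len(frontier) > 0:
    crawl_time, (url, found_via) = frontier.next()  # pop the domain due next from the heap
    delay = crawl_time - time.time()
    if delay > 0:
        time.sleep(delay)                  # wait until the domain's polite crawl time
    # ... fetch `url` and collect new links here (Crawler.check_url / collect_new_urls) ...
    frontier.notify_visit(url)             # re-queues the domain's Website if URLs remain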