commit a1612806241d715e7e92293e1f928a35ba62ba02
parent 7924e13a2029fe3faed9321bad84bb9b21629a0f
Author: Stefan <stefan@eliteinformatiker.de>
Date: Tue, 5 Feb 2013 14:41:38 +0100
added Website class for later use
Diffstat:
2 files changed, 38 insertions(+), 19 deletions(-)
diff --git a/crawler.py b/crawler.py
@@ -72,7 +72,7 @@ class Crawler(object):
         response = self.check_url(url, found_via)
         self.frontier.notify_visit(url)
-
+
         if response != None and not self.excluded(url):
             self.collect_new_urls(url, response.read())
@@ -82,6 +82,7 @@ class Crawler(object):
         try:
             for page in self.extract_urls(html):
                 page = urlparse.urljoin(url, page)
+                print("adding page %s" % page)
                 self.frontier.add(page, url)
         except UnicodeEncodeError:
             pass
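
The two hunks above sit inside Crawler's visit path. To show how the pieces fit together, here is a minimal sketch of a loop that could drive the Frontier; the crawl() wrapper and the sleep handling are assumptions for illustration, not part of this commit, while check_url, excluded, collect_new_urls and notify_visit are the methods visible in the diff:

    import time

    def crawl(crawler):
        while len(crawler.frontier) > 0:
            # next() hands back the scheduled crawl time and an (url, found_via) pair
            next_time, (url, found_via) = crawler.frontier.next()
            # politeness: wait until the domain's scheduled crawl time
            wait = next_time - time.time()
            if wait > 0:
                time.sleep(wait)
            response = crawler.check_url(url, found_via)
            crawler.frontier.notify_visit(url)
            if response is not None and not crawler.excluded(url):
                crawler.collect_new_urls(url, response.read())
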
diff --git a/frontier.py b/frontier.py
@@ -4,13 +4,10 @@ import heapq
 class Frontier(object):
     def __init__(self):
-        # A list of urls that still have to be searched sorted by
-        # domains,
-        self.urls = {}
-
         # A list containing the next crawl times on domain level,
         # to achieve an optimal throughput while maintaining a polite policy
-        self.crawltimes = []
+        self.frontier = []
+        self.websites = {}

         # URLs we have already found and added to our set
         self.found = set()
@@ -33,31 +30,52 @@ class Frontier(object):
         domain = urlparse.urlparse(url).netloc

-        # means this is the first URL in our set
-        if not domain in self.urls:
-            self.urls[domain] = []
-            heapq.heappush(self.crawltimes, (time.time(), domain))
+        # reuse the Website for this domain, or create it on first sight
+        if domain in self.websites:
+            website = self.websites[domain]
+        else:
+            website = Website(domain)
+            heapq.heappush(self.frontier, (time.time(), website))
+            self.websites[domain] = website

-        self.urls[domain].append((url, found_via))
+        website.add_url(url, found_via)
         self.found.add(url)

         return True
     def next(self):
-        next_time, next_domain = heapq.heappop(self.crawltimes)
-
-        next_url = self.urls[next_domain].pop()
+        next_time, next_website = heapq.heappop(self.frontier)

-        if len(self.urls[next_domain]) == 0:
-            del(self.urls[next_domain])
+        next_url = next_website.next_url()

         return next_time, next_url
     def notify_visit(self, url):
         domain = urlparse.urlparse(url).netloc
-
+        website = self.websites[domain]
+
         # If there are still other URLs on this domain to crawl, schedule the next crawl time
-        if domain in self.urls:
-            heapq.heappush(self.crawltimes, (time.time() + self.polite_time, domain))
+        if len(website.urls) > 0:
+            heapq.heappush(self.frontier, (time.time() + self.polite_time, website))
+        else:
+            del(self.websites[domain])
     def __len__(self):
-        return sum([len(self.urls[domain]) for domain in self.urls])
+        # count pending URLs on every known website, including one that is
+        # currently being crawled (popped from the heap, not yet re-added)
+        return sum([len(website.urls) for website in self.websites.values()])
+
+
+class Website(object):
+    def __init__(self, domain):
+        self.domain = domain
+        self.urls = []
+        self.robots = None
+
+    def is_allowed(self, url):
+        # TODO
+        return True
+
+    def add_url(self, url, found_via):
+        self.urls.append((url, found_via))
+
+    def next_url(self):
+        return self.urls.pop()
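
The robots attribute and the is_allowed TODO suggest per-domain robots.txt handling is the "later use" the commit message mentions. One way it could be filled in with the Python 2 stdlib robotparser module; this is a sketch under that assumption, not part of this commit:

    import robotparser

    def is_allowed(self, url):
        # lazily fetch and cache this domain's robots.txt on first use
        if self.robots is None:
            self.robots = robotparser.RobotFileParser()
            self.robots.set_url("http://%s/robots.txt" % self.domain)
            self.robots.read()
        return self.robots.can_fetch("*", url)

Keeping the rules on the Website object fits the new design: each domain carries its own pending URLs, its entry in the crawl-time heap and, eventually, its own robots.txt rules.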