commit ad589588caefca329d4bbcd5778652f340a8e855
parent e332d0be7bfc6cf3d91de13b4bfc7584c3617a4d
Author: Stefan <cct@stefan-koch.name>
Date: Tue, 22 Jan 2013 21:59:11 +0100
add basic crawler
Diffstat:
A  crawler.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 63 insertions(+), 0 deletions(-)
diff --git a/crawler.py b/crawler.py
@@ -0,0 +1,63 @@
+from BeautifulSoup import BeautifulSoup
+import urllib2, urlparse
+import re
+import time
+
+class Crawler(object):
+    def __init__(self, init_url):
+        # URLs still to visit (used as a stack) and URLs already seen
+        self.pages = [init_url]
+        self.done = set()
+
+        self.url_match = None
+        self.wait_time = 0
+
+    def set_url_restrict(self, regexp):
+        # only URLs matching this pattern will be queued
+        self.url_match = re.compile(regexp)
+
+    def set_wait_time(self, seconds):
+        self.wait_time = seconds
+
+    def crawl(self):
+        while self.pages:
+            time.sleep(self.wait_time)
+
+            next_url = self.pages.pop()
+            print next_url
+
+            try:
+                html = self.visit_url(next_url)
+            except urllib2.URLError:
+                # skip pages that cannot be fetched
+                continue
+            finally:
+                self.done.add(next_url)
+
+            try:
+                for page in self.extract_urls(html):
+                    # resolve relative links against the current page
+                    page = urlparse.urljoin(next_url, page)
+
+                    if (page not in self.done and page not in self.pages
+                            and self.url_match.search(page)):
+                        self.pages.append(page)
+            except UnicodeEncodeError:
+                # some pages yield URLs that mix str and unicode badly
+                pass
+
+    def visit_url(self, url):
+        # fetch the page and return its raw HTML
+        return urllib2.urlopen(url).read()
+
+    def extract_urls(self, page):
+        # collect the href attribute of every anchor tag
+        soup = BeautifulSoup(page)
+        return [link.get('href') for link in soup.findAll('a')
+                if link.get('href')]
+
+
+c = Crawler("http://stefan-koch.name/")
+c.set_url_restrict("http://stefan-koch.name/.+")
+c.set_wait_time(5)
+c.crawl()
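
The diff above targets Python 2 (urllib2, urlparse, BeautifulSoup 3). As a rough illustration only, not part of this commit: on Python 3 the fetch-and-extract step could be sketched as below, assuming the bs4 package with its html.parser backend; fetch_links is a hypothetical helper name, not something this code defines.

from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def fetch_links(url):
    # fetch one page and return every link on it as an absolute URL
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    return [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)]

One design note: because crawl() takes the next URL with pages.pop(), the traversal is depth-first; popping from the front of the list (or using collections.deque with popleft()) would make it breadth-first instead.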