crawl.py

阚小云

2023-12-01

#!/usr/bin/env python from sys import argv from os import makedirs, unlink, sep from os.path import isdir, exists, dirname, splitext from string import replace, find, lower from htmllib import HTMLParser from urllib import urlretrieve from urlparse import urlparse, urljoin from formatter import DumbWriter, AbstractFormatter from cStringIO import StringIO class Retriever(object): def __init__(self, url): self.url = url self.file = self.filename(url) #self.file = 'www.hao123.com/index.htm' def filename(self, url, deffile='index.htm'): parsedurl = urlparse(url, 'http:', 0) #url = 'http://www.hao123.com/index.htm' print 'parsedurl = ', parsedurl path = parsedurl[1] + parsedurl[2] #path = 'www.hao123.com/index.htm' print 'path = ', path ext = splitext(path) #ext = ('www.hao123.com/index', '.htm') print 'ext = ', ext if ext[1] == '': if path[-1] == '/': path += deffile else: path += '/' + deffile print 'path = ', path ldir = dirname(path) # local directory ldir = 'www.hao123.com' print 'ldir = ', ldir if sep != '/': # os-indep. path separator ldir = replace(ldir, ',', sep) print 'ldir = ', ldir if not isdir(ldir): # create archive dir if nec. if exists(ldir): unlink(ldir) makedirs(ldir) # create ''www.hao123.com' directory print 'path = ', path return path def download(self): # download Web page try: print 'self.url = ', self.url print 'self.file = ', self.file retval = urlretrieve(self.url, self.file) print 'retval = ', retval except IOError: retval = ('*** ERROR: invalid URL "%s"' % / self.url, ) return retval def parseAndGetLinks(self): # pars HTML, save links self.parser = HTMLParser(AbstractFormatter( / DumbWriter(StringIO()))) self.parser.feed(open(self.file).read()) self.parser.close() return self.parser.anchorlist class Crawler(object): # manage entire crawling process count = 0 # static downloaded page counter def __init__(self, url): self.q = [url] self.seen = [] self.dom = urlparse(url)[1] def getPage(self, url): r = Retriever(url) retval = r.download() if retval[0] == '*': # error situation, do not parse print retval, '... skipping parse' return Crawler.count = Crawler.count + 1 print '/n(', Crawler.count, ')' print 'URL:', url print 'FILE:', retval[0] self.seen.append(url) links = r.parseAndGetLinks() # get and process links for eachLink in links: if eachLink[:4] != 'http' and / find(eachLink, '://') == -1: eachLink = urljoin(url, eachLink) print '* ', eachLink, ' '*((60 - len(eachLink)) if len(eachLink) < 60 else 0), if find(lower(eachLink), 'mailto:') != -1 or find(eachLink, 'javascript:') != -1: print '... discarded, javascript' continue if eachLink[-1] == '/': #discarded such http://www.hao123.com/ eachLink = eachLink[:-1] if eachLink not in self.seen: if find(eachLink, self.dom) == -1: print '... discarded, not in domain' else: if eachLink not in self.q: self.q.append(eachLink) print '... new, added to Q' else: print '... discarded, already in Q' else: print '... discarded, already processed' def go(self): # process links in queue while self.q: if Crawler.count > 3: break url = self.q.pop() self.getPage(url) def main(): if len(argv) > 1: url = argv[1] else: try: url = raw_input('Enter starting URL: ') except (KeyboardInterrupt, EOFError): url = '' if not url: return robot = Crawler(url) robot.go() if __name__ == '__main__': main()

>crawl.py http://www.hao123.com/index.htm

结果如下：

parsedurl = ParseResult(scheme='http', netloc='www.hao123.com', path='/index.htm', params='', query='', fragment='')
path = www.hao123.com/index.htm
ext = ('www.hao123.com/index', '.htm')
path = www.hao123.com/index.htm
ldir = www.hao123.com
ldir = www.hao123.com
path = www.hao123.com/index.htm
self.url = http://www.hao123.com/index.htm
self.file = www.hao123.com/index.htm
retval = ('www.hao123.com/index.htm', <httplib.HTTPMessage instance at 0x010F9968>)

( 1 )
URL: http://www.hao123.com/index.htm
FILE: www.hao123.com/index.htm
* http://www.hao123.com                                         ... new, added to Q
* http://www.hao123.com/redian/tongzhi.htm                      ... new, added to Q
* http://utility.hao123.com/quality_form.php                    ... discarded, not in domain
* javascript:void(0)                                            ... discarded, javascript
* http://www.hao123.com/redian/scookie.htm                      ... new, added to Q
* javascript:void(0)                                            ... discarded, javascript
* javascript:void(0)                                            ... discarded, javascript
* javascript:void(0)                                            ... discarded, javascript
* http://www.hao123.com                                         ... discarded, already in Q
* http://wenku.baidu.com                                        ... discarded, not in domain
* http://baike.baidu.com                                        ... discarded, not in domain
* http://jingyan.baidu.com                                      ... discarded, not in domain
* http://hi.baidu.com                                           ... discarded, not in domain
* http://top.baidu.com                                          ... discarded, not in domain
* http://dict.baidu.com                                         ... discarded, not in domain
* http://s.baidu.com                                            ... discarded, not in domain
* http://www.baidu.com                                          ... discarded, not in domain
* http://www.hao123.com/daquan/shfwsite.htm                     ... new, added to Q
* http://www.hao123.com/netbuy.htm                              ... new, added to Q
* http://www.hao123.com/caipiao.htm                             ... new, added to Q
* http://www.hao123.com/haoserver/index.htm                     ... new, added to Q
* http://www.hao123.com/tianqi.htm                              ... new, added to Q
* http://www.hao123.com/stock.htm                               ... new, added to Q
* http://www.hao123.com/stock3.htm                              ... new, added to Q
* http://www.hao123.com/bankjt.htm                              ... new, added to Q
* http://www.hao123.com/lvyou.htm                               ... new, added to Q

..........

crawl.py

相关阅读

相关文章

相关问答