IT源码网

糗事百科python爬虫

xmjava 2021年02月16日 编程语言 412 0
# -*- coding: utf-8 -*- 
#coding=utf-8 
 
import urllib 
import urllib2 
import re 
import thread 
import time 
 
class QSBK:
    """Interactive console reader for the "hot" listing of qiushibaike.com.

    Pages are downloaded with ``urllib2``, the (author, content) pairs are
    extracted with a regular expression, and one story is printed each time
    the user presses Enter.  Entering ``Q`` stops the reader.

    Attributes:
        pageIndex: Number of the next listing page to download.
        user_agent: ``User-Agent`` string sent with every request.
        header: Request headers built from ``user_agent``.
        store: Queue of already-parsed pages; each page is a list of
            ``[author, content]`` pairs.
        enable: ``True`` while the interactive loop should keep running.
    """

    # Captures (author, content) for every story block in a listing page.
    # Compiled once for the class instead of on every page; re.S lets ``.``
    # span the newlines between the tags.
    _STORY_PATTERN = re.compile(
        r'<div class="author.*?<a.*?<img.*?</a>.*?<a.*?<h2>(.*?)</h2>.*?class="content.*?<span>\s*(.*?)\s*</span>',
        re.S)

    def __init__(self):
        self.pageIndex = 1
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.header = {'User-Agent': self.user_agent}
        self.store = []
        self.enable = False

    def getPage(self, pageIndex):
        """Download listing page ``pageIndex`` and return it decoded as UTF-8.

        Returns ``None`` (after printing a message) when the request fails
        with ``urllib2.URLError``.
        """
        url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
        try:
            request = urllib2.Request(url, headers=self.header)
            response = urllib2.urlopen(request)
            try:
                return response.read().decode('utf-8')
            finally:
                # Always release the connection, even if read/decode fails.
                response.close()
        except urllib2.URLError as e:
            # ``reason`` is not guaranteed to be a string (it can be a
            # socket error object), so convert it before concatenating.
            print('链接网络失败' + str(getattr(e, 'reason', e)))
            return None

    def _parseStories(self, page):
        """Return the ``[author, content]`` pairs found in the HTML ``page``."""
        return [[author, content]
                for author, content in self._STORY_PATTERN.findall(page)]

    def getPageItem(self, pageIndex):
        """Fetch page ``pageIndex`` and return its stories.

        Returns a list of ``[author, content]`` pairs, or ``None`` when the
        page could not be downloaded.
        """
        page = self.getPage(pageIndex)
        if page is None:
            print("页面获得失败")
            return None
        return self._parseStories(page)

    def loadPage(self):
        """Prefetch the next page while running and fewer than two are queued."""
        if not self.enable:
            return
        if len(self.store) < 2:
            pageStories = self.getPageItem(self.pageIndex)
            if pageStories is not None:
                self.store.append(pageStories)
                self.pageIndex += 1

    def getOneStory(self, pageStories):
        """Print the stories of one page, one per line of user input.

        Entering ``Q`` (or reaching end-of-input) clears ``enable`` and
        returns immediately.
        """
        for story in pageStories:
            try:
                command = raw_input()
            except EOFError:
                # Closed stdin: nothing more can be read, so treat as quit.
                command = 'Q'
            if command == 'Q':
                # Quit before prefetching so exiting never waits on the
                # network.
                self.enable = False
                return
            self.loadPage()
            print(u'%s %s' % (story[0], story[1]))

    def start(self):
        """Run the interactive loop until the user quits or pages run out."""
        print(u"正在读取糗事百科的数据,按Q退出")
        self.enable = True
        self.loadPage()
        while self.enable:
            if not self.store:
                # The queue ran dry (failed download, or the previous page
                # had no stories so nothing triggered a prefetch).  Try once
                # more; if that yields nothing usable, stop instead of
                # spinning forever.
                self.loadPage()
                if not self.store or not self.store[0]:
                    self.enable = False
                    break
            pageStories = self.store.pop(0)
            self.getOneStory(pageStories)
 
 
 
# Script entry point: create the reader and hand control to its
# interactive loop (blocks until the user quits).
spider = QSBK()
spider.start()

 

评论关闭
IT源码网

微信公众号:IT虾米 (左侧二维码扫一扫)欢迎添加!