爬虫:获取知乎日报数据


参考原文:http://me.ivydom.com/archives/python-zhihudaily.html

原代码如下:

import urllib.request as http
import json
import re

class APIMgr:
    """Small client for the Zhihu Daily JSON API.

    Holds the endpoint URLs and the request headers, performs the HTTP
    request and returns the decoded JSON payload as Python objects.
    """

    def __init__(self):
        """Set up the endpoint URLs and the browser-like request headers."""
        self.apiLatest = 'http://news-at.zhihu.com/api/3/news/latest'
        # Same "news-at" host as the two other endpoints; the date string
        # is appended to this prefix.
        self.apiBefore = 'http://news-at.zhihu.com/api/3/news/before/'
        # The story id is appended to this prefix.
        self.apiID = 'http://news-at.zhihu.com/api/3/news/'

        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}

    def access(self, url):
        """Fetch *url* and return its body parsed as JSON.

        The request object, response object, raw bytes and decoded text of
        the most recent call stay available as ``self.req``, ``self.fp``,
        ``self.mybytes`` and ``self.mystr``.

        Raises whatever ``urllib.request.urlopen`` raises on a failed
        request, ``UnicodeDecodeError`` if the body is not UTF-8 and
        ``json.JSONDecodeError`` if it is not valid JSON.
        """
        self.req = http.Request(url, headers=self.headers)
        # The context manager closes the response even when read() fails.
        with http.urlopen(self.req) as response:
            self.fp = response
            self.mybytes = response.read()
        self.mystr = self.mybytes.decode("utf8")
        return json.loads(self.mystr)

    def getLatestNews(self):
        """Return the decoded response of the "latest" endpoint."""
        return self.access(self.apiLatest)

    def getBeforeNews(self, date):
        """Return the decoded response of the "before" endpoint for *date*.

        *date* is appended to the endpoint prefix after conversion with
        ``str``, so both ``'20140101'`` and ``20140101`` are accepted.
        """
        return self.access(self.apiBefore + str(date))

    def getBeoreNews(self, date):
        """Misspelled original name of :meth:`getBeforeNews`, kept for callers."""
        return self.getBeforeNews(date)

    def getAssignedNews(self, id_):
        """Return the decoded response for the story with id *id_*.

        *id_* may be a string or an integer; it is converted with ``str``
        before being appended to the endpoint prefix.
        """
        return self.access(self.apiID + str(id_))

    def parse(self):
        """Decode the JSON text stored in ``self.a``.

        NOTE(review): nothing in this class assigns ``self.a``; unless the
        caller sets it first this raises ``AttributeError``. Confirm whether
        ``self.mystr`` was intended.
        """
        return json.loads(self.a)

class NewsMgr:
    """Base wrapper around a decoded news payload.

    ``self.data`` holds whatever object the constructor was given; the
    accessors below index it by key.
    """

    def __init__(self, newsData):
        """Store *newsData* as ``self.data``."""
        self.data = newsData

    def getType(self):
        """Return ``self.data['type']``."""
        return self.data['type']

    def getID(self):
        """Return ``self.data['id']``."""
        return self.data['id']

    def __repr__(self):
        """Return the repr of the wrapped payload.

        ``__repr__`` must return a ``str``; returning the payload object
        itself makes ``repr()`` raise ``TypeError``.
        """
        return repr(self.data)

class SingleNews(NewsMgr):
    """A single story fetched by id through an API manager."""

    def __init__(self, apimgr, id_):
        """Fetch the story *id_* via ``apimgr.getAssignedNews`` and wrap it.

        *id_* is passed on as a string so that integer ids (such as the
        ones found in a decoded story list) work as well.
        """
        NewsMgr.__init__(self, apimgr.getAssignedNews(str(id_)))

    def getImages(self):
        """Return ``self.data['image']``."""
        return self.data['image']

    def getTitle(self):
        """Return ``self.data['title']``."""
        return self.data['title']

    def __len__(self):
        """Return the length of the story body."""
        return len(self.getBody())

    def getImageSource(self):
        """Return the image credit stored under ``'image_source'``.

        The previous implementation read the ``'date'`` key, which does not
        match the method's purpose.
        """
        return self.data['image_source']

    def getGaPrefix(self):
        """Return ``self.data['ga_prefix']``."""
        return self.data['ga_prefix']

    def getGaOrefix(self):
        """Misspelled original name of :meth:`getGaPrefix`, kept for callers."""
        return self.getGaPrefix()

    def getBody(self):
        """Return ``self.data['body']``."""
        return self.data['body']

    def getShareUrl(self):
        """Return ``self.data['share_url']``."""
        return self.data['share_url']

    def getCSS(self):
        """Return the first entry of ``self.data['css']``."""
        return self.data['css'][0]

class AllDateNews(NewsMgr):
    """The story list of one day, fetched through an API manager.

    ``self.preData`` keeps the full decoded response; ``self.data`` is its
    ``'stories'`` list.
    """

    def __init__(self, apimgr, date=''):
        """Load the latest stories, or the stories for *date* when given.

        Raises:
            ValueError: if *date* is non-empty and fails ``dateIsLegal``.
        """
        if not date:
            self.preData = apimgr.getLatestNews()
        elif self.dateIsLegal(date):
            # Prefer the correctly spelled method; fall back to the
            # misspelled "getBeoreNews" for managers that only offer that.
            fetch = getattr(apimgr, 'getBeforeNews', None)
            if fetch is None:
                fetch = apimgr.getBeoreNews
            self.preData = fetch(date)
        else:
            # Python 3 cannot raise a plain string; use a real exception.
            raise ValueError('DateError:date is not legal')
        NewsMgr.__init__(self, self.preData['stories'])

    def getImages(self):
        """Return the first image of the first story."""
        return self.data[0]['images'][0]

    def getNewsNum(self):
        """Return the number of stories."""
        return len(self.data)

    def getDate(self):
        """Return ``self.preData['date']``."""
        return self.preData['date']

    def getStories(self):
        """Return the story list."""
        return self.data

    def __len__(self):
        """Return the number of stories."""
        return self.getNewsNum()

    def dateIsLegal(self, date):
        """Return a match object if *date* is exactly eight digits, else None.

        The first four digits must not be ``0000``. ``fullmatch`` is used so
        that trailing characters after the eight digits are rejected.
        """
        return re.fullmatch(r'(?!0000)[0-9]{4}[0-9]{4}', date)

    def getTitle(self, index):
        """Return the title of the story at position *index*."""
        return self.data[index]['title']

    def getID(self, index):
        """Return the id of the story at position *index*."""
        return self.data[index]['id']

if __name__ == '__main__':
    # Demo run: show one story by id, then list the latest stories.
    apimgr = APIMgr()
    divider = '-' * 100

    print(divider)
    print('知乎日报ID为4497219的文章题目是:')
    single = SingleNews(apimgr, '4497219')
    print('    ', single.getTitle())
    print('知乎日报ID为4497219的图片地址是:')
    print('    ', single.getImages())

    print(divider)
    allnews = AllDateNews(apimgr)
    total = len(allnews)
    print('今日的知乎日报消息有:', total, '条')
    # One line per story: title followed by id.
    for index in range(total):
        title = allnews.getTitle(index)
        story_id = allnews.getID(index)
        print('    标题:', title, 'ID:', story_id)

为了便于学习和理解,我们对所有代码都加了注释说明,如下:

import urllib.request as http 
#导入urllib库中的request模块,且使用别名http
#urllib提供了一系列用于操作URL的功能,request模块可以方便地抓取URL内容

import json 
#导入json模块

import re 
#导入re模块,提供对正则表达式的支持

class APIMgr:
    """Manage and access the Zhihu Daily API.

    Class names conventionally start with an upper-case letter.
    """

    def __init__(self):
        """Set up the endpoint URLs and the request headers.

        A function defined inside a class is called a method. Its first
        parameter is always ``self``, and the caller does not pass it.
        """
        self.apiLatest = 'http://news-at.zhihu.com/api/3/news/latest'
        # Same "news-at" host as the two other endpoints; the date string
        # is appended to this prefix.
        self.apiBefore = 'http://news-at.zhihu.com/api/3/news/before/'
        # The story id is appended to this prefix.
        self.apiID = 'http://news-at.zhihu.com/api/3/news/'

        # Per the original author's note, Zhihu Daily rejects crawlers, so
        # the request imitates a browser through its User-Agent header.
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}

    def access(self, url):
        """Fetch *url* and return its body parsed as JSON.

        The request object, response object, raw bytes and decoded text of
        the most recent call stay available as ``self.req``, ``self.fp``,
        ``self.mybytes`` and ``self.mystr``.

        Raises whatever ``urllib.request.urlopen`` raises on a failed
        request, ``UnicodeDecodeError`` if the body is not UTF-8 and
        ``json.JSONDecodeError`` if it is not valid JSON.
        """
        # Describe a single HTTP request.
        # url: an absolute resource locator string.
        # headers: the request header fields.
        self.req = http.Request(url, headers=self.headers)

        # urlopen returns a file-like object for the remote resource; the
        # context manager closes it even when read() fails.
        with http.urlopen(self.req) as response:
            self.fp = response
            # read() returns the response body as bytes.
            self.mybytes = response.read()

        # Decode the UTF-8 bytes into a str.
        self.mystr = self.mybytes.decode("utf8")

        # Convert the JSON text into Python objects.
        return json.loads(self.mystr)

    # The original annotations stopped here; the remaining methods are
    # described by their docstrings.

    def getLatestNews(self):
        """Return the decoded response of the "latest" endpoint."""
        return self.access(self.apiLatest)

    def getBeforeNews(self, date):
        """Return the decoded response of the "before" endpoint for *date*.

        *date* is appended to the endpoint prefix after conversion with
        ``str``, so both ``'20140101'`` and ``20140101`` are accepted.
        """
        return self.access(self.apiBefore + str(date))

    def getBeoreNews(self, date):
        """Misspelled original name of :meth:`getBeforeNews`, kept for callers."""
        return self.getBeforeNews(date)

    def getAssignedNews(self, id_):
        """Return the decoded response for the story with id *id_*.

        *id_* may be a string or an integer; it is converted with ``str``
        before being appended to the endpoint prefix.
        """
        return self.access(self.apiID + str(id_))

    def parse(self):
        """Decode the JSON text stored in ``self.a``.

        NOTE(review): nothing in this class assigns ``self.a``; unless the
        caller sets it first this raises ``AttributeError``. Confirm whether
        ``self.mystr`` was intended.
        """
        return json.loads(self.a)

class NewsMgr:
    """Common base for the news wrappers; keeps the decoded payload."""

    def __init__(self, newsData):
        """Remember *newsData* as ``self.data``."""
        self.data = newsData

    def getType(self):
        """Look up the ``'type'`` entry of the payload."""
        return self.data['type']

    def getID(self):
        """Look up the ``'id'`` entry of the payload."""
        return self.data['id']

    def __repr__(self):
        """Return the payload's repr as a string.

        Python requires ``__repr__`` to return ``str``; handing back the
        payload object itself made ``repr()`` fail with ``TypeError``.
        """
        return repr(self.data)

class SingleNews(NewsMgr):
    """One story, looked up by id through an API manager."""

    def __init__(self, apimgr, id_):
        """Fetch the story *id_* via ``apimgr.getAssignedNews`` and wrap it.

        *id_* is converted with ``str`` first, so integer ids are accepted
        in addition to strings.
        """
        NewsMgr.__init__(self, apimgr.getAssignedNews(str(id_)))

    def getImages(self):
        """Return the value stored under ``'image'``."""
        return self.data['image']

    def getTitle(self):
        """Return the value stored under ``'title'``."""
        return self.data['title']

    def __len__(self):
        """Return the length of the story body."""
        return len(self.getBody())

    def getImageSource(self):
        """Return the image credit stored under ``'image_source'``.

        The previous implementation read the ``'date'`` key, which does not
        match the method's purpose.
        """
        return self.data['image_source']

    def getGaPrefix(self):
        """Return the value stored under ``'ga_prefix'``."""
        return self.data['ga_prefix']

    def getGaOrefix(self):
        """Misspelled original name of :meth:`getGaPrefix`, kept for callers."""
        return self.getGaPrefix()

    def getBody(self):
        """Return the value stored under ``'body'``."""
        return self.data['body']

    def getShareUrl(self):
        """Return the value stored under ``'share_url'``."""
        return self.data['share_url']

    def getCSS(self):
        """Return the first entry of the ``'css'`` list."""
        return self.data['css'][0]

class AllDateNews(NewsMgr):
    """All stories of one day, fetched through an API manager.

    The complete decoded response is kept in ``self.preData``; its
    ``'stories'`` list becomes ``self.data``.
    """

    def __init__(self, apimgr, date=''):
        """Load the latest stories, or those for *date* when one is given.

        Raises:
            ValueError: if *date* is non-empty and fails ``dateIsLegal``.
        """
        if not date:
            self.preData = apimgr.getLatestNews()
        elif self.dateIsLegal(date):
            # Use the correctly spelled method when the manager has it and
            # the misspelled "getBeoreNews" otherwise; the date must be
            # passed along either way.
            fetch = getattr(apimgr, 'getBeforeNews', None)
            if fetch is None:
                fetch = apimgr.getBeoreNews
            self.preData = fetch(date)
        else:
            # Raising a bare string is itself a TypeError in Python 3.
            raise ValueError('DateError:date is not legal')
        NewsMgr.__init__(self, self.preData['stories'])

    def getImages(self):
        """Return the first image of the first story."""
        return self.data[0]['images'][0]

    def getNewsNum(self):
        """Return how many stories were loaded."""
        return len(self.data)

    def getDate(self):
        """Return the ``'date'`` field of the full response."""
        return self.preData['date']

    def getStories(self):
        """Return the list of stories."""
        return self.data

    def __len__(self):
        """Return how many stories were loaded."""
        return self.getNewsNum()

    def dateIsLegal(self, date):
        """Return a match object if *date* is exactly eight digits, else None.

        The leading four digits may not be ``0000``. ``fullmatch`` rejects
        input with extra characters after the eight digits, which a plain
        ``match`` would let through.
        """
        return re.fullmatch(r'(?!0000)[0-9]{4}[0-9]{4}', date)

    def getTitle(self, index):
        """Return the title of the story at position *index*."""
        return self.data[index]['title']

    def getID(self, index):
        """Return the id of the story at position *index*."""
        return self.data[index]['id']

if __name__ == '__main__':
    # Script entry point: print one story's title and image, then the
    # titles and ids of the latest stories.
    apimgr = APIMgr()
    rule = '-' * 100

    # Part 1: a single story looked up by id.
    print(rule)
    print('知乎日报ID为4497219的文章题目是:')
    single = SingleNews(apimgr, '4497219')
    print('    ', single.getTitle())
    print('知乎日报ID为4497219的图片地址是:')
    print('    ', single.getImages())

    # Part 2: the latest story list.
    print(rule)
    allnews = AllDateNews(apimgr)
    count = len(allnews)
    print('今日的知乎日报消息有:', count, '条')
    for index in range(count):
        print(
            '    标题:', allnews.getTitle(index),
            'ID:', allnews.getID(index),
        )