# 员工编写的爬取知乎数据 (employee-written script to scrape Zhihu Live data)

import requests


def scrapy(link):
    """Fetch *link* with a browser-like User-Agent and return the response body.

    Parameters
    ----------
    link : str
        URL to request (a Zhihu API endpoint in this script).

    Returns
    -------
    str
        Decoded response text (expected to be JSON).
    """
    headers = {
        # A browser User-Agent is sent so the API does not reject the request
        # as coming from an obvious bot client.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
    }
    # timeout=10 prevents a dead connection from hanging the crawl forever;
    # the original call had no timeout at all.
    r = requests.get(link, headers=headers, timeout=10)
    return r.text


# Fetch the first page of the Zhihu Live home feed and show the raw JSON body.
link = "https://api.zhihu.com/lives/homefeed?includes=live"

html = scrapy(link)

print (html)



# In[3]:



import json

# Decode the JSON body fetched above and inspect the two pagination fields:
# 'next' is the URL of the following page, 'is_end' flags the last page.
decodejson = json.loads(html)

next_page = decodejson['paging']['next']

is_end = decodejson['paging']['is_end']

print (next_page)

print (is_end)





import requests

from pymongo import MongoClient

import json

import time

import random


# Connect to a local MongoDB instance; feed pages are stored as raw JSON
# documents in database 'zhihu_database', collection 'live'.
client = MongoClient('localhost',27017)

db = client.zhihu_database

collection = db.live


#定义爬虫函数

def scrapy(link):
    """Fetch *link* with a browser-like User-Agent and return the response body.

    Parameters
    ----------
    link : str
        URL to request (a Zhihu API endpoint in this script).

    Returns
    -------
    str
        Decoded response text (expected to be JSON).
    """
    headers = {
        # A browser User-Agent is sent so the API does not reject the request
        # as coming from an obvious bot client.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
    }
    # timeout=10 prevents a dead connection from hanging the crawl forever;
    # the original call had no timeout at all.
    r = requests.get(link, headers=headers, timeout=10)
    return r.text


link = "https://api.zhihu.com/lives/homefeed?includes=live"

is_end = False

# Loop over every page of the Live home feed: store each raw JSON page in
# MongoDB, then follow the 'next' link until the API flags the last page.
while not is_end:

   html = scrapy(link)

   decodejson = json.loads(html)

   # NOTE(review): insert_one mutates decodejson in place by adding an '_id'
   # key; harmless here because only 'paging' is read afterwards.
   collection.insert_one(decodejson)


   link = decodejson['paging']['next']

   is_end = decodejson['paging']['is_end']

   print (link, is_end)

   # Polite random delay (roughly 2-4 seconds) between requests.
   time.sleep(random.randint(2,3) + random.random())





from pymongo import MongoClient

client = MongoClient('localhost',27017)

db = client.zhihu_database

collection = db.live


# Sanity check: read back the first stored feed page and print the id of
# every Live it contains (each page holds a 'data' list of Live entries).
first_page = collection.find_one()

for each in first_page['data']:

   print (each['live']['id'])




import requests

from pymongo import MongoClient

import json

import time

import random


client = MongoClient('localhost',27017)

db = client.zhihu_database


# Sample Live id used for a one-off audience crawl below.
live_id = '989811253094866944'


def get_audience(live_id):
    """Page through the member (audience) list of one Zhihu Live and store
    every result page in MongoDB.

    Parameters
    ----------
    live_id : str
        Identifier of the Live whose audience should be fetched.

    Side effects
    ------------
    Inserts one document per API page into ``db.live_audience`` (each tagged
    with ``live_id``), prints progress, and sleeps 2-4 seconds between
    requests.
    """
    headers = {
        # Browser User-Agent so the API does not reject the request.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
    }
    # First page of the members endpoint; subsequent pages come from
    # the 'next' URL the API returns.
    link = 'https://api.zhihu.com/lives/' + live_id + '/members?limit=10&offset=0'

    is_end = False
    while not is_end:
        # timeout=10 prevents a dead connection from hanging the crawl
        # forever; the original call had no timeout at all.
        r = requests.get(link, headers=headers, timeout=10)

        html = r.text

        decodejson = json.loads(html)

        # Tag the stored page with its Live id so the pages of one Live
        # can be grouped when querying the collection later.
        decodejson['live_id'] = live_id

        db.live_audience.insert_one(decodejson)

        link = decodejson['paging']['next']

        is_end = decodejson['paging']['is_end']

        print(link, is_end)

        # Polite random delay (roughly 2-4 seconds) between requests.
        time.sleep(random.randint(2,3) + random.random())


# Kick off one crawl for the sample live id defined above.
get_audience(live_id)




import requests

from pymongo import MongoClient

import json

import time

import random


client = MongoClient('localhost',27017)

db = client.zhihu_database


# Crawl the audience of every Live previously stored in the 'live'
# collection: each stored feed page holds a 'data' list of Live entries.
for each_page in db.live.find():

   for each in each_page['data']:

       live_id = each['live']['id']

       print (live_id)        

       get_audience(live_id)