import requests

def scrapy(link):
    # Request the page with a browser-like User-Agent so the API accepts the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
    }
    r = requests.get(link, headers=headers)
    return r.text

link = "https://api.zhihu.com/lives/homefeed?includes=live"
html = scrapy(link)
print(html)
import json

# Decode the JSON response and read the paging information:
# 'next' is the URL of the next page, 'is_end' tells us when to stop
decodejson = json.loads(html)
next_page = decodejson['paging']['next']
is_end = decodejson['paging']['is_end']
print(next_page)
print(is_end)
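# A minimal sketch, not part of the original flow: pretty-print the decoded
# response to inspect its structure before deciding which fields to use;
# ensure_ascii=False keeps any Chinese text readable.
print(json.dumps(decodejson, ensure_ascii=False, indent=2))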
import requests
from pymongo import MongoClient
import json
import time
import random

# Connect to MongoDB
client = MongoClient('localhost', 27017)
db = client.zhihu_database
collection = db.live

# Define the scraping function
def scrapy(link):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
    }
    r = requests.get(link, headers=headers)
    return r.text

link = "https://api.zhihu.com/lives/homefeed?includes=live"
is_end = False
# Loop until the API reports the last page, storing each page of Lives
while not is_end:
    html = scrapy(link)
    decodejson = json.loads(html)
    collection.insert_one(decodejson)
    link = decodejson['paging']['next']
    is_end = decodejson['paging']['is_end']
    print(link, is_end)
    # Pause 2-4 seconds between requests to avoid hammering the API
    time.sleep(random.randint(2, 3) + random.random())
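# A quick sanity check (my addition, not in the original): count how many feed
# pages were stored; count_documents requires pymongo 3.7 or newer.
print(collection.count_documents({}))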
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.zhihu_database
collection = db.live

# Read back the first stored page and print the id of each Live in it
first_page = collection.find_one()
for each in first_page['data']:
    print(each['live']['id'])
import requests
from pymongo import MongoClient
import json
import time
import random

client = MongoClient('localhost', 27017)
db = client.zhihu_database

live_id = '989811253094866944'

def get_audience(live_id):
    # Page through the members endpoint of one Live and store every page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
    }
    link = 'https://api.zhihu.com/lives/' + live_id + '/members?limit=10&offset=0'
    is_end = False
    while not is_end:
        r = requests.get(link, headers=headers)
        html = r.text
        decodejson = json.loads(html)
        # Tag each stored page with the Live it belongs to
        decodejson['live_id'] = live_id
        db.live_audience.insert_one(decodejson)
        link = decodejson['paging']['next']
        is_end = decodejson['paging']['is_end']
        print(link, is_end)
        time.sleep(random.randint(2, 3) + random.random())

get_audience(live_id)
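# A minimal sketch (my addition) to check what was stored for this Live.
# It assumes each stored page carries a 'data' list like the home feed does;
# the exact member fields come from the API and are left to inspection.
for page in db.live_audience.find({'live_id': live_id}):
    print(len(page.get('data', [])))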
import requests
from pymongo import MongoClient
import json
import time
import random

client = MongoClient('localhost', 27017)
db = client.zhihu_database

# Walk every stored feed page, extract each Live's id,
# and crawl its audience with get_audience (defined above)
for each_page in db.live.find():
    for each in each_page['data']:
        live_id = each['live']['id']
        print(live_id)
        get_audience(live_id)
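# After the crawl, an assumed aggregation sketch (my addition): count how many
# audience pages were stored per Live, grouping on the 'live_id' tag that
# get_audience adds to every stored page.
for row in db.live_audience.aggregate([
        {'$group': {'_id': '$live_id', 'pages': {'$sum': 1}}}]):
    print(row['_id'], row['pages'])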