# Employee-written program that scrapes "shuoshuo" (说说) post data from the website.

import getpass
import json
import os
import time

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

# Page that is opened in the automated browser.
url = 'https://i.qq.com/'

# HTTP headers: a desktop-browser User-Agent plus a Cookie string.
# NOTE(review): the Cookie value is a captured login session committed to
# source -- treat it as a leaked secret (rotate it, load it from outside
# the repository).
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44',
    'Cookie': '_qpsvr_localtk=0.7146312865549047; ptui_loginuin=18393135; pgv_pvid=8624709835; pgv_info=ssid=s8744790440; uin=o0018393135; skey=@18VUUFS11; RK=0AkcR+agMC; ptcz=df18e579ed50cfbb65aba3401083ba2932bcc1585ba56243806f89f674d69582; p_uin=o0018393135; pt4_token=9vjZxekMhyM*UCW7XKgTu7*IxzEAKpHetqI5dCBzklU_; p_skey=hUOk9itH92op08DoEoLq9O-cFt8Iog5ueztpD1za8o0_; Loading=Yes; __Q_w_s_hat_seed=1',
}

# Location of the chromedriver binary handed to Selenium.
driver_path = r"D:\chromedriver.exe"

# Start Chrome through that driver and load the page.
browser = webdriver.Chrome(executable_path=driver_path)
browser.get(url)

def getGTK(cookie):
    """Compute the g_tk value from the ``p_skey`` entry of *cookie*.

    Starting from 5381, every character of ``cookie['p_skey']`` is folded in
    as ``h = h * 33 + ord(ch)``; the result is masked to its low 31 bits.

    Raises:
        KeyError: if *cookie* has no ``'p_skey'`` key.
    """
    token = 5381
    for code in map(ord, cookie['p_skey']):
        # h + (h << 5) is h * 33.
        token = token * 33 + code
    return token & 0x7fffffff

# Credentials are taken from the environment instead of being committed to
# the source file; the password is prompted for when QQ_PASSWORD is unset.
qq_uin = os.environ.get('QQ_UIN', '18393135')
qq_password = os.environ.get('QQ_PASSWORD') or getpass.getpass('QQ password: ')

try:
    # The login form elements are looked up inside the 'login_frame' iframe.
    browser.switch_to.frame('login_frame')
    # find_element(By.ID, ...) exists in Selenium 3 and 4, whereas the
    # find_element_by_* helpers were removed in Selenium 4.3.
    # Presumably this switches the widget to account/password login.
    browser.find_element(By.ID, 'switcher_plogin').click()
    browser.find_element(By.ID, 'u').send_keys(qq_uin)
    browser.find_element(By.ID, 'p').send_keys(qq_password)
    browser.find_element(By.ID, 'login_button').click()

    # Fixed wait for the login to complete before the cookies are read.
    time.sleep(20)
    cookies = browser.get_cookies()
finally:
    # Always shut the driver down so no browser/chromedriver process leaks.
    browser.quit()

# Flatten Selenium's list of cookie records into a name -> value dict.
cookie = {elem['name']: elem['value'] for elem in cookies}
# Log the cookie names only: the values are live session secrets.
print(sorted(cookie))

if 'p_skey' not in cookie:
    raise SystemExit('Login did not yield a p_skey cookie; cannot compute g_tk.')

url1 = 'https://user.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6'
params = {
    'uin': qq_uin,
    'ftype': 0,
    'sort': 0,
    'pos': 0,
    'num': 20,
    'replynum': 100,
    'callback': '_preloadCallback',
    'code_version': 1,
    'format': 'jsonp',
    'need_private_comment': 1,
    # Exactly one g_tk, derived from the p_skey of the session just opened.
    'g_tk': getGTK(cookie),
}

# Authenticate with the cookies of the fresh session. requests does not
# regenerate a Cookie header that is already present, so any pre-set
# 'Cookie' entry is dropped from the headers before passing `cookies=`.
request_headers = {k: v for k, v in header.items() if k.lower() != 'cookie'}
response = requests.get(
    url1,
    params=params,
    headers=request_headers,
    cookies=cookie,
    timeout=30,
)
response.raise_for_status()
print(response.url)

body = response.content.decode('utf-8')
print(body)

# Unwrap the JSONP envelope `callback( ... );` by locating the outermost
# parentheses rather than relying on fixed character offsets.
open_paren = body.find('(')
close_paren = body.rfind(')')
if open_paren == -1 or close_paren <= open_paren:
    raise ValueError('Response is not a JSONP payload: %r' % body[:200])
response_fix = body[open_paren + 1:close_paren]

jsonjson_bodyody = json.loads(response_fix)
print(jsonjson_bodyody)

# 'msglist' may be null or absent; both are treated as "no posts".
msglist = jsonjson_bodyody.get('msglist')
print(msglist)
print('*'*20)

# Keep each post's text and its creation time, converted from the epoch
# timestamp to local time formatted as YYYY-MM-DD HH:MM:SS.
ss_list = [
    {
        'content': msg['content'],
        'created_time': time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(int(msg['created_time']))
        ),
    }
    for msg in (msglist or [])
]

print(ss_list)