# Scraper for the novels hosted on the Kanunu8 (努努书房) website.

import requests

import re

import os

# Table-of-contents page of the book to download; the scrape starts here and
# chapter links are resolved relative to it.
start_url = 'https://www.kanunu8.com/files/old/2011/2512.html'

def get_source(url, timeout=10):
    """Download a page and return its HTML decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded with the ``gbk`` codec.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parsers.
    response.raise_for_status()
    return response.content.decode('gbk')

def get_toc(html, base_url=None):
    """Extract the absolute URL of every chapter from a table-of-contents page.

    Args:
        html: Source of the table-of-contents page.
        base_url: URL that the chapter links are resolved against. Defaults
            to the module-level ``start_url``.

    Returns:
        A list of chapter URLs in page order. The list is empty when the page
        contains no block that starts at "正文" and ends at ``</tbody>``.
    """
    from urllib.parse import urljoin

    if base_url is None:
        base_url = start_url

    toc_match = re.search('正文(.*?)</tbody>', html, re.S)
    if toc_match is None:
        # No table of contents on this page: report "no chapters" rather
        # than failing with an opaque IndexError.
        return []

    hrefs = re.findall('href="(.*?)"', toc_match.group(1), re.S)
    # urljoin resolves relative, root-relative and absolute links alike;
    # substituting the literal '2512.html' only worked for one specific book.
    return [urljoin(base_url, href) for href in hrefs]

def get_article(chaper_url, name_pattern='<title>(.*?)</title>',
                text_pattern='<p>(.*?)</p>'):
    """Fetch one chapter page and return its name and text.

    NOTE(review): in the listing this code came from, both patterns were the
    bare ``'(.*?)'``, which always matches the empty string, so the function
    returned ``('', '')`` for every page. The surrounding HTML tags appear to
    have been stripped from the listing. The defaults here are a best-effort
    reconstruction and must be confirmed against the real page markup; pass
    ``name_pattern`` / ``text_pattern`` to override them.

    Args:
        chaper_url: Address of the chapter page.
        name_pattern: Regular expression whose first group captures the
            chapter name.
        text_pattern: Regular expression whose first group captures the
            chapter text.

    Returns:
        A ``(chapter_name, text_block)`` tuple of strings.

    Raises:
        ValueError: If either pattern does not match the page.
    """
    chaper_html = get_source(chaper_url)

    name_match = re.search(name_pattern, chaper_html, re.S)
    text_match = re.search(text_pattern, chaper_html, re.S)
    if name_match is None or text_match is None:
        raise ValueError(
            'could not locate chapter name/text in ' + chaper_url)

    chapter_name = name_match.group(1).strip()
    text_block = text_match.group(1).replace(' ', '')
    # NOTE(review): the space literal above may originally have been the
    # '&nbsp;' entity (rendered as a space in the listing); strip it as well.
    text_block = text_block.replace('&nbsp;', '')
    return chapter_name, text_block

def save(chapter_name, text_block, directory='警世通言'):
    """Write one chapter to ``<directory>/<chapter_name>.txt`` as UTF-8.

    Args:
        chapter_name: Chapter title, used as the file name. Surrounding
            whitespace is trimmed, and characters that are path separators
            or not allowed in file names on common platforms are replaced
            with ``_``.
        text_block: Chapter text to write.
        directory: Folder the file is written into; created if missing.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    # A raw title containing '/', ':' and the like would make open() fail or
    # place the file outside the target directory.
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', chapter_name.strip())

    os.makedirs(directory, exist_ok=True)
    target = os.path.join(directory, safe_name + '.txt')
    with open(target, 'w', encoding='utf-8') as f:
        f.write(text_block)

def exe_artile(url):
    """Download every chapter linked from a table-of-contents page and save it.

    Args:
        url: Address of the table-of-contents page, e.g.
            'https://www.kanunu8.com/files/old/2011/2512.html'.
    """
    toc_html = get_source(url)
    for link in get_toc(toc_html):
        title, body = get_article(link)
        save(title, body)

# Run the scraper only when this file is executed as a script, so that
# importing the module does not start downloading.
if __name__ == '__main__':
    exe_artile(start_url)