import os
import re
from urllib.parse import urljoin

import requests
# Table-of-contents page of the book to download; get_toc builds the
# chapter URLs relative to this address.
start_url = 'https://www.kanunu8.com/files/old/2011/2512.html'
def get_source(url, timeout=10):
    """Download a page and return its HTML source as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The response body decoded with the ``gbk`` codec.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
        UnicodeDecodeError: If the body is not valid GBK.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx rather than handing an error page to the parsers.
    response.raise_for_status()
    return response.content.decode('gbk')
def get_toc(html, base_url=None):
    """Extract the absolute URL of every chapter from a table-of-contents page.

    Args:
        html: HTML source of the table-of-contents page.
        base_url: URL that the chapter links are resolved against. Defaults
            to the module-level ``start_url``.

    Returns:
        A list of absolute chapter URLs in the order they appear on the page.

    Raises:
        IndexError: If the page has no block between '正文' and '</tbody>'.
    """
    if base_url is None:
        base_url = start_url
    toc_match = re.search(r'正文(.*?)</tbody>', html, re.S)
    if toc_match is None:
        # Same exception type the bare ``[0]`` lookup used to raise, but with
        # a message that says what is actually missing.
        raise IndexError('table-of-contents block not found in page source')
    hrefs = re.findall(r'href="(.*?)"', toc_match.group(1), re.S)
    # urljoin handles relative, root-relative and absolute hrefs; replacing
    # the file name inside the base URL only worked for plain relative links.
    return [urljoin(base_url, href) for href in hrefs]
def get_article(chaper_url):
    """Fetch one chapter page and pull out its title and body text.

    Args:
        chaper_url: URL of the chapter page.

    Returns:
        A ``(chapter_name, text_block)`` tuple: the content of the first
        ``<font color="#dc143c">`` element and the content of the first
        ``<p>`` element with every ``<br />`` removed.
    """
    page = get_source(chaper_url)
    title_hits = re.findall('<font color="#dc143c">(.*?)</font>', page, re.S)
    title = title_hits[0]
    paragraph_hits = re.findall('<p>(.*?)</p>', page, re.S)
    # Splitting on the marker and re-joining drops every '<br />'.
    body = ''.join(paragraph_hits[0].split('<br />'))
    return title, body
def save(chapter_name, text_block, out_dir='警世通言'):
    """Write one chapter to ``<out_dir>/<chapter_name>.txt`` encoded as UTF-8.

    Args:
        chapter_name: Chapter title, used as the file name (without suffix).
        text_block: Chapter text to write.
        out_dir: Directory that receives the file; created if missing.

    The title is sanitized before use: path separators, characters that are
    illegal in file names on common platforms, and control characters are
    replaced with ``_``; surrounding spaces and dots are stripped. A title
    that ends up empty is stored as ``untitled.txt``.
    """
    # The title is scraped from a web page, so it must not be able to leave
    # out_dir ('/', '\\', '..') or make open() fail ('?', ':', newlines, ...).
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', chapter_name).strip(' .')
    if not safe_name:
        safe_name = 'untitled'
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, safe_name + '.txt'), 'w', encoding='utf-8') as f:
        f.write(text_block)
def exe_artile(url):
    """Download every chapter linked from a table-of-contents page and save it.

    Args:
        url: Address of the table-of-contents page, e.g.
            'https://www.kanunu8.com/files/old/2011/2512.html'.
    """
    toc_page = get_source(url)
    for chapter_link in get_toc(toc_page):
        name, body = get_article(chapter_link)
        save(name, body)
# Test run: fetch the table of contents at start_url and save every chapter.
# NOTE(review): this call also executes when the module is imported; consider
# moving it under an `if __name__ == '__main__':` guard.
exe_artile(start_url)