# Crawler that downloads a novel from the Kanunu8 (努努书房) website.

import os
import re
from urllib.parse import urljoin

import requests


# Table-of-contents page of the book to crawl; get_toc() builds chapter URLs from it.
start_url = 'https://www.kanunu8.com/files/old/2011/2512.html'


def get_source(url, timeout=10):
    """Download a page and return its HTML decoded from GBK.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            an explicit timeout ``requests`` can block indefinitely.

    Returns:
        The response body decoded as GBK text. Bytes that are not valid
        GBK are replaced with U+FFFD instead of raising
        ``UnicodeDecodeError``.

    Raises:
        requests.RequestException: On connection problems, timeouts, or
            an HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than handing an error page to the parsers.
    response.raise_for_status()
    return response.content.decode('gbk', errors='replace')


def get_toc(html, base_url=None):
    """Extract the absolute URL of every chapter from a table-of-contents page.

    Args:
        html: Source of the table-of-contents page.
        base_url: Address the page was fetched from; chapter links are
            resolved against it. Defaults to the module-level ``start_url``.

    Returns:
        Chapter URLs in the order their links appear on the page.

    Raises:
        IndexError: If the page contains no ``正文 ... </tbody>`` block.
            The exception type matches what the original ``[0]`` lookup
            raised, but now carries a descriptive message.
    """
    if base_url is None:
        base_url = start_url

    toc_blocks = re.findall(r'正文(.*?)</tbody>', html, re.S)
    if not toc_blocks:
        raise IndexError('table of contents block ("正文 ... </tbody>") not found')

    hrefs = re.findall(r'href="(.*?)"', toc_blocks[0], re.S)

    # urljoin resolves relative, root-relative and absolute links alike,
    # instead of assuming the index page is always named '2512.html'.
    return [urljoin(base_url, href) for href in hrefs]


def get_article(chaper_url):
    """Download one chapter page and return its title and text.

    Args:
        chaper_url: Address of the chapter page.

    Returns:
        A ``(chapter_name, text_block)`` tuple: the contents of the first
        ``<font color="#dc143c">`` element with surrounding whitespace
        removed, and the contents of the first ``<p>`` element with its
        ``<br />`` tags removed.

    Raises:
        IndexError: If the page has no title element or no ``<p>``
            element. The exception type matches what the original ``[0]``
            lookups raised, but now names the offending URL.
    """
    chaper_html = get_source(chaper_url)

    titles = re.findall(r'<font color="#dc143c">(.*?)</font>', chaper_html, re.S)
    if not titles:
        raise IndexError('chapter title not found in %s' % chaper_url)

    paragraphs = re.findall(r'<p>(.*?)</p>', chaper_html, re.S)
    if not paragraphs:
        raise IndexError('chapter text not found in %s' % chaper_url)

    # The title is later used as a file name, so drop whitespace and
    # newlines that the markup may leave around it.
    chapter_name = titles[0].strip()
    text_block = paragraphs[0].replace('<br />', '')
    return chapter_name, text_block


def save(chapter_name, text_block, directory='警世通言'):
    """Write one chapter to ``<directory>/<chapter_name>.txt`` as UTF-8.

    Args:
        chapter_name: Chapter title, used as the file name. Surrounding
            whitespace is removed, and characters that are unsafe in file
            names (path separators, ``: * ? " < > |`` and control
            characters) are replaced with ``_`` so a title can neither
            break ``open()`` nor escape ``directory``.
        text_block: Chapter text to write.
        directory: Folder that receives the file; created when missing.
            Defaults to ``'警世通言'``.
    """
    os.makedirs(directory, exist_ok=True)

    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', chapter_name.strip())
    if not safe_name:
        # An empty title would otherwise produce a hidden file named '.txt'.
        safe_name = 'untitled'

    target = os.path.join(directory, safe_name + '.txt')
    with open(target, 'w', encoding='utf-8') as f:
        f.write(text_block)


def exe_artile(url):
    """Fetch a table-of-contents page and store every chapter it lists.

    The page at ``url`` is downloaded with get_source() and its chapter
    links are extracted with get_toc(). Each chapter is then fetched with
    get_article() and handed to save().

    Args:
        url: Address of the table-of-contents page, e.g.
            'https://www.kanunu8.com/files/old/2011/2512.html'.
    """
    index_html = get_source(url)
    for chapter_link in get_toc(index_html):
        title, body = get_article(chapter_link)
        save(title, body)



# Test run: crawl the book whose table of contents is at start_url.
# NOTE(review): this call executes on import as well as when run as a script;
# consider an `if __name__ == "__main__":` guard.

exe_artile(start_url)