Crawler case study: scraping the Four Great Classical Novels by title and saving them as text files
Shicimingju (诗词名句网): https://www.shicimingju.com
Goal: given the title of one of the Four Great Classical Novels as input, scrape the complete work, including the book title, author, dynasty, and the text of every chapter.
The Shicimingju homepage lists an 古籍 (ancient classics) section; today's case scrapes the Four Great Classical Novels from that section.
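Before diving into the full crawler, it can help to confirm that the 古籍 index page (the /book path used throughout the case code) responds normally and to see which encoding it reports, since get_soup() below relies on the apparent encoding. A minimal sanity-check sketch (the short user-agent string here is an illustrative placeholder; the case code uses a full browser UA):

import requests

# Quick check of the book index page before building the crawler.
url = 'https://www.shicimingju.com/book'
headers = {'user-agent': 'Mozilla/5.0'}  # placeholder UA; the case code below uses a full browser string
res = requests.get(url, headers=headers, timeout=10)
print(res.status_code)        # 200 means the page is reachable
print(res.apparent_encoding)  # the encoding guessed from the page body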
The source code for this case is as follows:
import time
import requests
from bs4 import BeautifulSoup
import random

# Browser-like user-agent sent with every request so the site serves normal pages
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
}
# Fetch a page and return it as a BeautifulSoup object
def get_soup(html_url):
    res = requests.get(html_url, headers=headers)
    res.encoding = res.apparent_encoding
    html = res.text  # res.text honours the encoding set on the line above
    soup = BeautifulSoup(html, 'lxml')
    return soup
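# Hypothetical usage sketch, not part of the original case code: get_soup() can be
# smoke-tested on the book index page before running the whole crawler, e.g.
#   soup = get_soup('https://www.shicimingju.com/book')
#   print(soup.title.text)  # the page title of the 古籍 index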
# Return a dict mapping each classic's title to its URL on the site
def get_book_url(page_url):
    book_url_dic = {}
    soup = get_soup(page_url)
    div_tag = soup.find(class_='card booknark_card')
    title_lst = div_tag.ul.find_all(name='li')
    for title in title_lst:
        # strip the 《》 brackets around the title and build an absolute URL
        book_url_dic[title.a.text.strip('《》')] = 'https://www.shicimingju.com' + title.a['href']
    return book_url_dic
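# Hypothetical usage sketch, assuming the class names above still match the live site:
#   url_dic = get_book_url('https://www.shicimingju.com/book')
#   print(url_dic)
# This should print a dict whose keys are the book titles (without 《》) and whose
# values are the absolute URLs of each book's chapter index.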
# Return the content of one chapter as a list of paragraphs
def get_chapter_content(chapter_url):
    chapter_content_lst = []
    chapter_soup = get_soup(chapter_url)
    div_chapter = chapter_soup.find(class_='card bookmark-list')
    chapter_content = div_chapter.find_all('p')
    for p_content in chapter_content:
        chapter_content_lst.append(p_content.text)
    # pause 1-3 seconds between chapter requests to avoid hammering the site
    time.sleep(random.randint(1, 3))
    return chapter_content_lst
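# Hardening note (not in the original code): if the page layout changes, find() can
# return None and div_chapter.find_all('p') would raise AttributeError. A defensive
# variant could return an empty list in that case:
#   if div_chapter is None:
#       return []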
# Main program
if __name__ == '__main__':
    # URL of the 古籍 (ancient classics) section
    gj_url = 'https://www.shicimingju.com/book'
    url_dic = get_book_url(gj_url)
    mz_name = input('请输入四大名著名称：')
    mz_url = url_dic[mz_name]
    soup = get_soup(mz_url)
    abbr_tag = soup.find(class_='card bookmark-list')
    book_name = abbr_tag.h1.text
    f = open(f'{book_name}.txt', 'a', encoding='utf-8')
    f.write('书名：' + book_name + '\n')
    print('名著名称：', book_name, end='\n')
    # introduction paragraphs (author, dynasty, summary) above the chapter list
    p_lst = abbr_tag.find_all('p')
    for p in p_lst:
        f.write(p.text + '\n')
    # the chapter list (book-mulu is the table of contents)
    mulu_lst = soup.find_all(class_='book-mulu')
    book_ul = mulu_lst[0].ul
    book_li = book_ul.find_all(name='li')
    for bl in book_li:
        print('\t\t', bl.text)
        chapter_url = 'https://www.shicimingju.com' + bl.a['href']
        f.write(bl.text + '\n')
        f.write(''.join(get_chapter_content(chapter_url)) + '\n')
    f.close()
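In the main program above, the output file is opened in append mode and closed manually, and an unknown book title raises a bare KeyError. The sketch below is a hedged variant of just the entry point, reusing the helper functions exactly as defined above: it validates the input, opens the file with a context manager so it is always closed, and writes in 'w' mode so re-running the script does not append a second copy. The scraping logic itself is unchanged.

# Variant of the main block only; requires get_soup, get_book_url and
# get_chapter_content as defined in the case code above.
if __name__ == '__main__':
    gj_url = 'https://www.shicimingju.com/book'
    url_dic = get_book_url(gj_url)
    mz_name = input('请输入四大名著名称：').strip('《》 ')
    if mz_name not in url_dic:
        print('未找到该书名，可选书名：', '、'.join(url_dic))
        raise SystemExit(1)
    soup = get_soup(url_dic[mz_name])
    abbr_tag = soup.find(class_='card bookmark-list')
    book_name = abbr_tag.h1.text
    # 'w' overwrites any file from a previous run instead of appending to it
    with open(f'{book_name}.txt', 'w', encoding='utf-8') as f:
        f.write('书名：' + book_name + '\n')
        for p in abbr_tag.find_all('p'):
            f.write(p.text + '\n')
        for bl in soup.find(class_='book-mulu').ul.find_all('li'):
            print('\t\t', bl.text)
            chapter_url = 'https://www.shicimingju.com' + bl.a['href']
            f.write(bl.text + '\n')
            f.write(''.join(get_chapter_content(chapter_url)) + '\n')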