import os
import json
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import re
# Category list to scrape
bookcat = [
    'xuanhuan',
    'xianxia',
    'dushi',
    'lishi',
    'youxi',
    'kehuan',
    'kongbu',
    'qita'
]
# Base URL of the list pages
base_url = "https://www.xiaoshuo.run/list/"
# Novels per page and page range (adjust to match the actual site)
novels_per_page = 10  # informational only; not used below
total_pages = 20  # assume each category has 20 pages
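# For reference, the list-page URLs visited by scrape_and_collect() below are
# built as f"{base_url}{category}/{page}", e.g. for the first category:
#   https://www.xiaoshuo.run/list/xuanhuan/1
#   https://www.xiaoshuo.run/list/xuanhuan/2
#   ...
#   https://www.xiaoshuo.run/list/xuanhuan/20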
# Helper: create a folder if it does not already exist
def create_folder_if_not_exists(folder_name):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
        print(f"Created folder: {folder_name}")
# Fetch a novel's detail page and save its metadata to info.json
def get_novel_info(detail_url, novel_folder):
    try:
        print(f"Fetching novel details: {detail_url}")
        response = requests.get(detail_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        novel_info = {}
        info_div = soup.find('div', {'id': 'info'})
        if info_div:
            novel_info['name'] = info_div.find('h1').text
            # Cover image URL
            sidebar_div = soup.find('div', {'id': 'sidebar'})
            img_tag = sidebar_div.find('img') if sidebar_div else None
            novel_info['images'] = img_tag['src'] if img_tag and 'src' in img_tag.attrs else ""
            # Article ID extracted from the detail URL
            article_id_match = re.search(r'articleid=(\d+)', detail_url)
            novel_info['urlid'] = article_id_match.group(1) if article_id_match else ""
            # Download link for the full text
            download_link = soup.find('a', {'rel': 'nofollow'})
            novel_info['txt_down'] = urljoin(detail_url, download_link['href']) \
                if download_link and 'href' in download_link.attrs else ""
            # Write the novel metadata to info.json
            info_file = os.path.join(novel_folder, 'info.json')
            with open(info_file, 'w', encoding='utf-8') as f:
                json.dump(novel_info, f, ensure_ascii=False, indent=4)
            print(f"Novel info saved: {info_file}")
            return novel_info
        else:
            print(f"Could not find novel details: {detail_url}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Request for detail page failed: {detail_url}, error: {e}")
        return None
    except Exception as e:
        print(f"Error while extracting novel info: {detail_url}, error: {e}")
        return None
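# For reference, the info.json written by get_novel_info() has this shape
# (values here are illustrative placeholders, not data taken from the site):
# {
#     "name": "<novel title>",
#     "images": "<cover image URL>",
#     "urlid": "<numeric id, if the detail URL contains articleid=...>",
#     "txt_down": "<absolute download link>"
# }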
# Download the novel's full text via its download link
def download_chapters(novel_folder, novel_info):
    try:
        print("Downloading chapter content")
        download_url = novel_info.get('txt_down', '')
        if not download_url:
            print("No download link found")
            return
        response = requests.get(download_url)
        response.raise_for_status()
        content = response.text
        # Create the chapters folder
        chapter_folder = os.path.join(novel_folder, 'chapters')
        create_folder_if_not_exists(chapter_folder)
        # Save the chapter content
        file_path = os.path.join(chapter_folder, 'chapter.txt')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Chapter content saved: {file_path}")
        print("======================================")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download chapter content, error: {e}")
    except Exception as e:
        print(f"Error while saving chapter content, error: {e}")
# Main crawler: walk every category and page, then collect each novel
def scrape_and_collect():
    for category in bookcat:
        # Per-category output folder
        BOOK_FOLDER = os.path.join("novels", category)
        create_folder_if_not_exists(BOOK_FOLDER)
        # Walk the paginated list pages
        for page in range(1, total_pages + 1):
            url = f"{base_url}{category}/{page}"
            print(f"\nProcessing category: {category}, page {page}: {url}")
            try:
                response = requests.get(url)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                novel_list = soup.find('div', {'id': 'newscontent'}).find_all('li')
                print(f"Extracted novel list, {len(novel_list)} novels found")
                for novel in novel_list:
                    a_tags = novel.find_all('a')
                    if not a_tags:
                        continue
                    # Link to the novel's detail page
                    detail_url = a_tags[0]['href']
                    if not detail_url.startswith('http'):
                        detail_url = urljoin(url, detail_url)
                    # Novel name, used for the folder name
                    novel_name = a_tags[0].text
                    # Save the novel under its category folder
                    novel_folder = os.path.join(BOOK_FOLDER, novel_name.replace('/', '_'))  # avoid illegal characters in the folder name
                    create_folder_if_not_exists(novel_folder)
                    # Fetch the novel info and save it to info.json
                    novel_info = get_novel_info(detail_url, novel_folder)
                    if novel_info:
                        # Download the chapter content
                        download_chapters(novel_folder, novel_info)
            except requests.exceptions.RequestException as e:
                print(f"Page request failed: {url}, error: {e}")
            except Exception as e:
                print(f"Error while processing page: {url}, error: {e}")
    print("\nScraping finished")
if __name__ == "__main__":
    # Create the root output folder
    create_folder_if_not_exists("novels")
    scrape_and_collect()
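# Resulting on-disk layout (one folder per category and per novel):
#   novels/
#       <category>/              e.g. novels/xuanhuan/
#           <novel name>/
#               info.json        metadata written by get_novel_info()
#               chapters/
#                   chapter.txt  full text saved by download_chapters()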