Scraping a novel site with Python

The script below walks the paginated category listings on xiaoshuo.run, saves each novel's metadata to an info.json file, and downloads its full-text TXT file.

import os
import json
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import re

# Category slugs to crawl
bookcat = [
    'xuanhuan',
    'xianxia',
    'dushi',
    'lishi',
    'youxi',
    'kehuan',
    'kongbu',
    'qita'
]

# Base listing URL
base_url = "https://www.xiaoshuo.run/list/"

# Novels per page and the page range to crawl (adjust to match the site)
novels_per_page = 10  # not actually used below
total_pages = 20  # assume each category has 20 pages


# Create a folder if it does not already exist
def create_folder_if_not_exists(folder_name):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
        print(f"创建文件夹: {folder_name}")


# Fetch a novel's metadata from its detail page and save it to info.json
def get_novel_info(detail_url, novel_folder):
    try:
        print(f"正在获取小说详情: {detail_url}")
        response = requests.get(detail_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        novel_info = {}

        info_div = soup.find('div', {'id': 'info'})
        if info_div:
            novel_info['name'] = info_div.find('h1').text

            # Cover image URL
            sidebar_div = soup.find('div', {'id': 'sidebar'})
            img_tag = sidebar_div.find('img') if sidebar_div else None
            novel_info['images'] = img_tag['src'] if img_tag and 'src' in img_tag.attrs else ""

            # Article ID extracted from the detail URL
            article_id_match = re.search(r'articleid=(\d+)', detail_url)
            novel_info['urlid'] = article_id_match.group(1) if article_id_match else ""

            # TXT download link
            download_link = soup.find('a', {'rel': 'nofollow'})
            novel_info['txt_down'] = (urljoin(detail_url, download_link['href'])
                                      if download_link and 'href' in download_link.attrs else "")

            # Write the novel metadata to info.json
            info_file = os.path.join(novel_folder, 'info.json')
            with open(info_file, 'w', encoding='utf-8') as f:
                json.dump(novel_info, f, ensure_ascii=False, indent=4)

            print(f"小说信息已保存: {info_file}")

            return novel_info
        else:
            print(f"无法找到小说详情: {detail_url}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"请求小说详情页失败: {detail_url}, 错误: {e}")
        return None
    except Exception as e:
        print(f"获取小说信息时出错: {detail_url}, 错误: {e}")
        return None


# Download the full-text TXT for a novel
def download_chapters(novel_folder, novel_info):
    try:
        print(f"正在下载章节内容")
        download_url = novel_info.get('txt_down', '')
        if not download_url:
            print("无法找到下载链接")
            return

        response = requests.get(download_url)
        response.raise_for_status()
        content = response.text

        # Create the chapters folder
        chapter_folder = os.path.join(novel_folder, 'chapters')
        create_folder_if_not_exists(chapter_folder)

        # Save the downloaded text
        file_path = os.path.join(chapter_folder, 'chapter.txt')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

        print(f"章节内容已保存: {file_path}")
        print(f"======================================")
    except requests.exceptions.RequestException as e:
        print(f"下载章节内容失败, 错误: {e}")
    except Exception as e:
        print(f"保存章节内容时出错, 错误: {e}")


# Main crawler: walk every category and page, then fetch each novel
def scrape_and_collect():
    for category in bookcat:
        # Per-category output folder
        BOOK_FOLDER = os.path.join("novels", category)
        create_folder_if_not_exists(BOOK_FOLDER)

        # Walk the paginated listing
        for page in range(1, total_pages + 1):
            url = f"{base_url}{category}/{page}"
            print(f"\n正在处理分类: {category}, 第 {page} 页: {url}")

            try:
                response = requests.get(url)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                novel_list = soup.find('div', {'id': 'newscontent'}).find_all('li')

                print(f"已提取小说列表,共 {len(novel_list)} 本小说")

                for novel in novel_list:
                    a_tags = novel.find_all('a')
                    if not a_tags:
                        continue

                    # Link to the novel's detail page
                    detail_url = a_tags[0]['href']
                    if not detail_url.startswith('http'):
                        detail_url = urljoin(url, detail_url)

                    # Novel name, used as the folder name
                    novel_name = a_tags[0].text
                    # Save the novel into the matching category folder
                    novel_folder = os.path.join(BOOK_FOLDER, novel_name.replace('/', '_'))  # avoid illegal characters in the folder name
                    create_folder_if_not_exists(novel_folder)

                    # Fetch the novel metadata and write info.json
                    novel_info = get_novel_info(detail_url, novel_folder)
                    if novel_info:
                        # Download the chapter content
                        download_chapters(novel_folder, novel_info)

            except requests.exceptions.RequestException as e:
                print(f"请求页面失败: {url}, 错误: {e}")
            except Exception as e:
                print(f"处理页面时出错: {url}, 错误: {e}")

    print("\n采集完成")


if __name__ == "__main__":
    # Create the root output folder
    create_folder_if_not_exists("novels")
    scrape_and_collect()
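
The requests.get calls above send no custom headers, set no timeout, and run as fast as the loops allow, which many sites throttle or block; they also rely on requests' guessed encoding, which can garble Chinese pages. Below is a minimal sketch of a fetch helper that addresses this; the User-Agent string, delay, and timeout values are assumptions, not anything required by the site.

import time
import requests

HEADERS = {"User-Agent": "Mozilla/5.0"}  # assumed placeholder; use a real browser UA if needed

def fetch(url, delay=1.0, timeout=10):
    # Pause briefly so consecutive requests do not hammer the server
    time.sleep(delay)
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    # requests often guesses ISO-8859-1 when the server omits a charset;
    # apparent_encoding inspects the body and handles Chinese pages better
    response.encoding = response.apparent_encoding
    return response

Each requests.get(...) / raise_for_status() pair in the script can be swapped for fetch(url) without changing the rest of the logic.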