用 Scrapy、XPath 等实现特定数据的爬取——爬取古诗文网

程序员281

701人浏览 · 2025-05-29 10:28:46

程序员281 · 2025-05-29 10:28:46 发布

使用 Scrapy 框架编写爬虫程序，爬取古诗文网（https://so.gushiwen.cn/mingjus/）中的每条名句及其对应的完整古诗词内容（有手就行）

一、项目初始化

创建Scrapy项目

# Create the "gushiwen" project, enter it, and generate the "mingju" spider for so.gushiwen.cn
scrapy startproject gushiwen
cd gushiwen
scrapy genspider mingju so.gushiwen.cn

二、修改items.py（数据模型）

import scrapy

class PoemItem(scrapy.Item):
    """Scraped record: one famous quote plus the full poem it belongs to."""

    # Famous-quote information
    famous_quote = scrapy.Field()      # text of the famous quote
    quote_source = scrapy.Field()      # source (attribution) of the quote
    
    # Full poem information
    poem_title = scrapy.Field()        # poem title
    poem_author = scrapy.Field()       # poem author
    poem_content = scrapy.Field()      # full text of the poem
    poem_dynasty = scrapy.Field()      # dynasty in which the poem was written
    
    # Metadata
    url = scrapy.Field()               # source URL
    crawl_time = scrapy.Field()        # time at which the page was crawled

三、爬虫实现（spiders/mingju.py）

import scrapy
from gushiwen.items import PoemItem
from datetime import datetime
import re

class MingjuSpider(scrapy.Spider):
    """Crawl the famous-quote list pages and the detail page behind each quote.

    ``parse`` walks the paginated quote list and schedules one detail request
    per quote; ``parse_poem_detail`` fills in the poem fields and yields the
    finished item.
    """

    name = 'mingju'
    allowed_domains = ['gushiwen.cn']
    start_urls = ['https://so.gushiwen.cn/mingjus/default.aspx']

    # Number of list pages to crawl; the start URL counts as page 1.
    max_pages = 3

    custom_settings = {
        'CONCURRENT_REQUESTS': 2,
        'DOWNLOAD_DELAY': 1.5,
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
    }

    def parse(self, response):
        """Parse one quote list page.

        Yields a detail-page request for every quote row (the partially
        filled item travels in ``meta['item']``) and, while fewer than
        ``max_pages`` list pages have been visited, a request for the next
        list page.
        """
        for row in response.xpath('//div[@class="sons"]/div[@class="cont"]'):
            quote = row.xpath('./a[1]/text()').get(default='').strip()
            href = row.xpath('./a[1]/@href').get()
            if not quote or not href:
                # Row without a quote link: there is no detail page to follow.
                continue

            poem_item = PoemItem()
            poem_item['famous_quote'] = quote
            poem_item['quote_source'] = row.xpath('./a[2]/text()').get(default='').strip()

            detail_url = response.urljoin(href)
            poem_item['url'] = detail_url

            yield scrapy.Request(
                detail_url,
                callback=self.parse_poem_detail,
                meta={'item': poem_item}
            )

        # Pagination: count pages explicitly instead of matching "page=3" in
        # the URL, which stopped one page early and also matched "page=30".
        page_no = response.meta.get('page', 1)
        next_page = response.xpath('//a[@class="amore" and contains(text(),"下一页")]/@href').get()
        if next_page and page_no < self.max_pages:
            yield response.follow(next_page, self.parse, meta={'page': page_no + 1})

    def parse_poem_detail(self, response):
        """Complete the item from ``meta['item']`` with the detail-page fields.

        Every poem field is always populated (``None`` / empty string when the
        page lacks it) so downstream pipelines can read the keys safely.
        """
        item = response.meta['item']
        item['crawl_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # Poem title
        item['poem_title'] = response.xpath('//div[@class="cont"]/h1/text()').get(default='').strip()
        if not item['poem_title']:
            self.logger.warning('No poem title found on %s', response.url)

        # Author and dynasty: default to None so the keys exist even when the
        # source line has fewer than two links.
        item['poem_author'] = None
        item['poem_dynasty'] = None
        author_info = response.xpath('//p[@class="source"]/a/text()').getall()
        if len(author_info) >= 2:
            item['poem_author'] = author_info[0].strip()
            item['poem_dynasty'] = re.sub(r'[\[\]]', '', author_info[1]).strip()

        # Poem body: join all text nodes and collapse runs of whitespace.
        content = ''.join(response.xpath('//div[@class="contson"]//text()').getall())
        item['poem_content'] = re.sub(r'\s+', ' ', content).strip()

        yield item

四、数据存储管道（pipelines.py）

1. 文本文件存储（JSON Lines 格式）

import json
from itemadapter import ItemAdapter

class JsonWriterPipeline:
    """Item pipeline that appends each item to ``poems_data.jsonl``, one JSON object per line."""

    def open_spider(self, spider):
        """Open the output file in append mode when the spider starts."""
        self.file = open('poems_data.jsonl', mode='a', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()

    def process_item(self, item, spider):
        """Serialize *item* as a single JSON line, write it, and pass the item on."""
        record = dict(item)
        # ensure_ascii=False keeps non-ASCII text readable instead of \u escapes.
        serialized = json.dumps(record, ensure_ascii=False)
        self.file.write(f"{serialized}\n")
        return item

2. MySQL存储

import pymysql
from itemadapter import ItemAdapter
from datetime import datetime

class MySQLPipeline:
    """Item pipeline that upserts crawled poems into the ``famous_poems`` MySQL table.

    Connection parameters default to the tutorial placeholders and can be
    overridden through the ``MYSQL_HOST``, ``MYSQL_USER``, ``MYSQL_PASSWORD``
    and ``MYSQL_DATABASE`` project settings (read in ``from_crawler``).
    """

    # Item keys in the order of the INSERT column list.
    _ITEM_KEYS = (
        'famous_quote', 'quote_source', 'poem_title',
        'poem_author', 'poem_dynasty', 'poem_content',
        'url', 'crawl_time',
    )

    def __init__(self, host='localhost', user='your_username',
                 password='your_password', database='poetry_db'):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.conn = None
        self.cursor = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from project settings, falling back to the defaults."""
        settings = crawler.settings
        return cls(
            host=settings.get('MYSQL_HOST', 'localhost'),
            user=settings.get('MYSQL_USER', 'your_username'),
            password=settings.get('MYSQL_PASSWORD', 'your_password'),
            database=settings.get('MYSQL_DATABASE', 'poetry_db'),
        )

    def open_spider(self, spider):
        """Connect to MySQL and make sure the target table exists."""
        self.conn = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            charset='utf8mb4'
        )
        self.cursor = self.conn.cursor()
        self.create_table()

    def create_table(self):
        """Create ``famous_poems`` if it does not exist yet."""
        # The column list must be closed with ")" before the table options;
        # without it MySQL rejects the statement with a syntax error.
        self.cursor.execute("""
        CREATE TABLE IF NOT EXISTS famous_poems (
            id INT AUTO_INCREMENT PRIMARY KEY,
            famous_quote TEXT NOT NULL,
            quote_source VARCHAR(255),
            poem_title VARCHAR(255) NOT NULL,
            poem_author VARCHAR(100),
            poem_dynasty VARCHAR(50),
            poem_content LONGTEXT NOT NULL,
            source_url VARCHAR(512),
            crawl_time DATETIME,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            UNIQUE KEY (famous_quote(100), poem_title(100))
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
        """)
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert *item*, or refresh ``poem_content`` when the quote/title pair already exists.

        Fields missing from the item are stored as NULL. Any database error
        rolls the transaction back and is re-raised to the caller.
        """
        # .get() instead of [] so items lacking optional fields (author,
        # dynasty) do not raise KeyError before reaching the database.
        params = tuple(item.get(key) for key in self._ITEM_KEYS)
        try:
            self.cursor.execute("""
                INSERT INTO famous_poems (
                    famous_quote, quote_source, poem_title,
                    poem_author, poem_dynasty, poem_content,
                    source_url, crawl_time
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE poem_content=VALUES(poem_content)
            """, params)
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise  # bare raise keeps the original traceback
        return item

    def close_spider(self, spider):
        """Release the cursor and connection; safe if the connection was never opened."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None

五、配置settings.py

# Name of the Scrapy project / bot.
BOT_NAME = 'gushiwen'

# Where Scrapy looks for spiders, and where `genspider` creates new ones.
SPIDER_MODULES = ['gushiwen.spiders']
NEWSPIDER_MODULE = 'gushiwen.spiders'

# NOTE(review): robots.txt is not honoured with this setting — confirm that is intended.
ROBOTSTXT_OBEY = False
# Throttling: delay between requests (seconds) and per-domain concurrency cap.
DOWNLOAD_DELAY = 1.5
CONCURRENT_REQUESTS_PER_DOMAIN = 2

# Enabled item pipelines; the one with the lower number runs first.
ITEM_PIPELINES = {
    'gushiwen.pipelines.JsonWriterPipeline': 300,
    'gushiwen.pipelines.MySQLPipeline': 400,
}

# Logging configuration
LOG_LEVEL = 'INFO'
LOG_FILE = 'gushiwen.log'

六、运行爬虫

# Run the "mingju" spider and export the scraped items to output.json.
# NOTE(review): in recent Scrapy versions -o appends to an existing file while -O overwrites it — confirm which is wanted.
scrapy crawl mingju -o output.json

七、MySQL表结构说明

-- Table holding one row per (famous quote, poem title) pair; the unique key
-- on the first 100 characters of each prevents duplicate rows.
CREATE TABLE `famous_poems` (
  `id` int NOT NULL AUTO_INCREMENT,
  `famous_quote` text COLLATE utf8mb4_unicode_ci NOT NULL COMMENT '名句内容',
  `quote_source` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '名句出处',
  `poem_title` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT '诗词标题',
  `poem_author` varchar(100) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '作者',
  `poem_dynasty` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '朝代',
  `poem_content` longtext COLLATE utf8mb4_unicode_ci NOT NULL COMMENT '诗词全文',
  `source_url` varchar(512) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '来源URL',
  `crawl_time` datetime DEFAULT NULL COMMENT '爬取时间',
  `created_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
  PRIMARY KEY (`id`),
  UNIQUE KEY `famous_quote` (`famous_quote`(100),`poem_title`(100))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

DAMO开发者矩阵

DAMO开发者矩阵，由阿里巴巴达摩院和中国互联网协会联合发起，致力于探讨最前沿的技术趋势与应用成果，搭建高质量的交流与分享平台，推动技术创新与产业应用链接，围绕“人工智能与新型计算”构建开放共享的开发者生态。

更多推荐

Robust World Models for Embodied AI: Unifying Equivariance with Adaptive Spectral Filtering

摘要（150字）：本文提出了一种面向具身智能的鲁棒世界模型框架，通过球谐图神经网络（SH-GNN）与自适应频谱滤波在"大脑+小脑"认知架构中的统一，实现了多维度物理建模的突破。核心创新包括：(1) 三行代码实现的SO(3)等变消息传递算子，统一处理1D/2D/3D数据；(2) 基于Parseval能量截断的零参数频谱去噪机制，信噪比提升5-40dB；(3) 跨58个物理领域的实验表明，3D点云具