用Scrapy、XPath等实现特定数据的爬取 --爬取古诗文网
·
使用Scrapy框架编写爬虫程序,爬取古诗文网(https://so.gushiwen.cn/mingjus/)网站中的每个名句及完整古诗词内容(有手就行)
一、项目初始化
-
创建Scrapy项目
scrapy startproject gushiwen
cd gushiwen
scrapy genspider mingju so.gushiwen.cn
二、修改items.py(数据模型)
import scrapy
class PoemItem(scrapy.Item):
    """Container for one famous quote together with the poem it comes from."""

    # --- Famous-quote information ---
    famous_quote = scrapy.Field()  # text of the famous quote
    quote_source = scrapy.Field()  # where the quote comes from

    # --- Full poem information ---
    poem_title = scrapy.Field()    # poem title
    poem_author = scrapy.Field()   # poem author
    poem_content = scrapy.Field()  # full text of the poem
    poem_dynasty = scrapy.Field()  # dynasty in which the poem was written

    # --- Crawl metadata ---
    url = scrapy.Field()           # source URL
    crawl_time = scrapy.Field()    # time the page was crawled
三、爬虫实现(spiders/mingju.py)
import scrapy
from gushiwen.items import PoemItem
from datetime import datetime
import re
class MingjuSpider(scrapy.Spider):
    """Crawl famous quotes and the full poem each quote belongs to.

    ``parse`` walks the paginated quote list and schedules one detail request
    per quote; ``parse_poem_detail`` fills in the remaining item fields from
    the poem page and yields the finished item.
    """

    name = 'mingju'
    allowed_domains = ['gushiwen.cn']
    start_urls = ['https://so.gushiwen.cn/mingjus/default.aspx']
    # Number of list pages to crawl, counting the start page (demo limit).
    max_pages = 3
    custom_settings = {
        'CONCURRENT_REQUESTS': 2,
        'DOWNLOAD_DELAY': 1.5,
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
    }

    def parse(self, response):
        """Parse one quote-list page.

        Yields a detail-page request for every quote entry that has both a
        text and a link, then follows the "next page" link until
        ``max_pages`` list pages have been visited.
        """
        # The start request carries no 'page' key, so it counts as page 1.
        page = response.meta.get('page', 1)

        for entry in response.xpath('//div[@class="sons"]/div[@class="cont"]'):
            quote = entry.xpath('./a[1]/text()').get(default='').strip()
            href = entry.xpath('./a[1]/@href').get()
            if not quote or not href:
                # Not a quote entry (or the markup changed): skip it instead
                # of crashing on None or re-requesting the list page itself.
                continue

            poem_item = PoemItem()
            poem_item['famous_quote'] = quote
            poem_item['quote_source'] = (
                entry.xpath('./a[2]/text()').get(default='').strip()
            )

            detail_url = response.urljoin(href)
            poem_item['url'] = detail_url
            yield scrapy.Request(
                detail_url,
                callback=self.parse_poem_detail,
                meta={'item': poem_item},
            )

        # Pagination: count pages explicitly rather than matching "page=3" in
        # the URL, which stopped one page early.
        next_page = response.xpath(
            '//a[@class="amore" and contains(text(),"下一页")]/@href'
        ).get()
        if next_page and page < int(self.max_pages):
            yield response.follow(
                next_page, self.parse, meta={'page': page + 1}
            )

    def parse_poem_detail(self, response):
        """Complete the item passed in ``response.meta['item']`` and yield it.

        Every field is always set (empty string or ``None`` when the page
        lacks the data) so downstream pipelines can read all keys safely.
        """
        item = response.meta['item']
        item['crawl_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # Poem title; empty string when the heading is missing.
        item['poem_title'] = (
            response.xpath('//div[@class="cont"]/h1/text()')
            .get(default='')
            .strip()
        )

        # Author and dynasty are read from the first two links of the source
        # line; default to None so the keys exist even when they are absent.
        item['poem_author'] = None
        item['poem_dynasty'] = None
        author_info = response.xpath('//p[@class="source"]/a/text()').getall()
        if len(author_info) >= 2:
            item['poem_author'] = author_info[0]
            item['poem_dynasty'] = re.sub(r'[\[\]]', '', author_info[1])

        # Poem body: join all text nodes and collapse runs of whitespace.
        content = ''.join(
            response.xpath('//div[@class="contson"]//text()').getall()
        )
        item['poem_content'] = re.sub(r'\s+', ' ', content).strip()
        yield item
四、数据存储管道(pipelines.py)
1. 文本文件存储
import json
from itemadapter import ItemAdapter
class JsonWriterPipeline:
    """Item pipeline that appends every scraped item to a JSON Lines file."""

    def open_spider(self, spider):
        """Open ``poems_data.jsonl`` in append mode when the spider starts."""
        self.file = open('poems_data.jsonl', mode='a', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()

    def process_item(self, item, spider):
        """Write *item* as one JSON line and return it unchanged."""
        record = dict(item)
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        serialized = json.dumps(record, ensure_ascii=False)
        self.file.write(f"{serialized}\n")
        return item
2. MySQL存储
import pymysql
from itemadapter import ItemAdapter
from datetime import datetime
class MySQLPipeline:
    """Item pipeline that stores scraped poems in the ``famous_poems`` table.

    The connection is opened in ``open_spider`` and closed in
    ``close_spider``; each item is inserted and committed individually.
    """

    def __init__(self):
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        """Connect to MySQL and make sure the target table exists."""
        self.conn = pymysql.connect(
            host='localhost',
            user='your_username',
            password='your_password',
            database='poetry_db',
            charset='utf8mb4'
        )
        self.cursor = self.conn.cursor()
        self.create_table()

    def create_table(self):
        """Create ``famous_poems`` if it does not exist yet."""
        # The closing ")" before ENGINE ends the column list; without it the
        # statement is rejected as a syntax error.
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS famous_poems (
                id INT AUTO_INCREMENT PRIMARY KEY,
                famous_quote TEXT NOT NULL,
                quote_source VARCHAR(255),
                poem_title VARCHAR(255) NOT NULL,
                poem_author VARCHAR(100),
                poem_dynasty VARCHAR(50),
                poem_content LONGTEXT NOT NULL,
                source_url VARCHAR(512),
                crawl_time DATETIME,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                UNIQUE KEY (famous_quote(100), poem_title(100))
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
        """)
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert *item*, or refresh ``poem_content`` on a duplicate key.

        Fields backed by nullable columns are read with ``.get`` so an item
        that lacks them is stored with NULL instead of raising ``KeyError``.
        Fields backed by NOT NULL columns are still required.

        Returns the item unchanged. Any error is re-raised after the
        transaction has been rolled back.
        """
        params = (
            item['famous_quote'],
            item.get('quote_source'),
            item['poem_title'],
            item.get('poem_author'),
            item.get('poem_dynasty'),
            item['poem_content'],
            item.get('url'),
            item.get('crawl_time'),
        )
        try:
            self.cursor.execute("""
                INSERT INTO famous_poems (
                    famous_quote, quote_source, poem_title,
                    poem_author, poem_dynasty, poem_content,
                    source_url, crawl_time
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE poem_content=VALUES(poem_content)
            """, params)
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            # Bare raise keeps the original traceback intact.
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and connection, if they were ever opened."""
        # open_spider may have failed before either attribute was assigned.
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()
五、配置settings.py
# Project identity and spider discovery.
BOT_NAME = 'gushiwen'
SPIDER_MODULES = ['gushiwen.spiders']
NEWSPIDER_MODULE = 'gushiwen.spiders'

# Crawl politeness: robots.txt is not consulted; requests are throttled.
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS_PER_DOMAIN = 2
DOWNLOAD_DELAY = 1.5

# Item pipelines; the number sets the order (lower values run first).
ITEM_PIPELINES = {
    'gushiwen.pipelines.JsonWriterPipeline': 300,
    'gushiwen.pipelines.MySQLPipeline': 400,
}

# Logging.
LOG_FILE = 'gushiwen.log'
LOG_LEVEL = 'INFO'
六、运行爬虫
scrapy crawl mingju -o output.json
七、MySQL表结构说明
-- Reference DDL for the `famous_poems` table: one row per famous quote,
-- together with the poem it belongs to and crawl metadata.
-- Rows are unique on the first 100 characters of `famous_quote` combined
-- with the first 100 characters of `poem_title`.
CREATE TABLE `famous_poems` (
`id` int NOT NULL AUTO_INCREMENT,
`famous_quote` text COLLATE utf8mb4_unicode_ci NOT NULL COMMENT '名句内容',
`quote_source` varchar(255) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '名句出处',
`poem_title` varchar(255) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT '诗词标题',
`poem_author` varchar(100) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '作者',
`poem_dynasty` varchar(50) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '朝代',
`poem_content` longtext COLLATE utf8mb4_unicode_ci NOT NULL COMMENT '诗词全文',
`source_url` varchar(512) COLLATE utf8mb4_unicode_ci DEFAULT NULL COMMENT '来源URL',
`crawl_time` datetime DEFAULT NULL COMMENT '爬取时间',
`created_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
PRIMARY KEY (`id`),
UNIQUE KEY `famous_quote` (`famous_quote`(100),`poem_title`(100))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
DAMO开发者矩阵,由阿里巴巴达摩院和中国互联网协会联合发起,致力于探讨最前沿的技术趋势与应用成果,搭建高质量的交流与分享平台,推动技术创新与产业应用链接,围绕“人工智能与新型计算”构建开放共享的开发者生态。
更多推荐


所有评论(0)