1、数据库表结构:
sql
CREATE TABLE spider_content (
id INT AUTO_INCREMENT PRIMARY KEY,
title VARCHAR(255),
content TEXT,
keywords VARCHAR(255),
description TEXT,
source_url VARCHAR(512),
publish_time DATETIME,
update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
status TINYINT DEFAULT 1
);
2、内容来源表:
sql
CREATE TABLE content_sources (
id INT AUTO_INCREMENT PRIMARY KEY,
domain VARCHAR(255),
crawl_rules TEXT,
update_frequency INT,
last_crawl_time DATETIME
);
内容聚合策略
1、基于关键词的聚合:
使用MySQL的全文索引功能实现相关内容聚合
sql
ALTER TABLE spider_content ADD FULLTEXT(title, content, keywords);
2、时间维度聚合:
sql
SELECT * FROM spider_content
WHERE publish_time BETWEEN '2023-01-01' AND '2023-12-31'
ORDER BY publish_time DESC;
3、多源内容去重:
sql
SELECT DISTINCT(title), content FROM spider_content
GROUP BY MD5(content) HAVING COUNT(*) = 1;
性能优化方案
1、分区表:
sql
ALTER TABLE spider_content PARTITION BY RANGE (YEAR(publish_time)) (
PARTITION p2020 VALUES LESS THAN (2021),
PARTITION p2021 VALUES LESS THAN (2022),
PARTITION p2022 VALUES LESS THAN (2023),
PARTITION pmax VALUES LESS THAN MAXVALUE
);
2、读写分离:
主库负责写入
从库集群负责查询
3、缓存策略:
使用Redis缓存热门聚合结果
实际应用示例
1、生成聚合页面:
sql
SELECT c1.* FROM spider_content c1
JOIN (
SELECT keywords FROM spider_content
WHERE id = 12345
) c2 ON c1.keywords LIKE CONCAT('%', c2.keywords, '%')
ORDER BY RAND() LIMIT 10;
2、定时更新任务:
sql
UPDATE spider_content
SET status = 0
WHERE update_time < DATE_SUB(NOW(), INTERVAL 30 DAY);