# Filter out words that are not Chinese or are shorter than 2 characters
keywords = [word for word in words if len(word) = 2 and re.match(r'[\u4e00-\u9fff]+', word)]
return keywords
# Store the keywords in MySQL
def save_to_mysql(keywords):
    """Persist keywords into the MySQL ``keywords`` table.

    Each distinct keyword is looked up first: when a row already exists its
    ``count`` column is incremented by one, otherwise a new row is inserted.
    All changes are committed together once every keyword has been
    processed, and the connection is closed whether or not an error occurs.

    Args:
        keywords: Iterable of keyword strings. Duplicates are collapsed, so
            each keyword is counted at most once per call.
    """
    sql_check = "SELECT * FROM keywords WHERE keyword = %s"
    sql_update = "UPDATE keywords SET count = count + 1 WHERE keyword = %s"
    # NOTE(review): only the keyword column is written on insert; this
    # presumably relies on the table giving `count` a default -- confirm
    # against the schema.
    sql_insert = "INSERT INTO keywords (keyword) VALUES (%s)"

    # MYSQL_CONFIG is defined elsewhere in the file; it is assumed to be a
    # dict of connection settings, so it must be unpacked into keyword
    # arguments rather than passed as a single positional value.
    connection = pymysql.connect(**MYSQL_CONFIG)
    try:
        with connection.cursor() as cursor:
            for keyword in set(keywords):  # de-duplicate
                # Check whether the keyword is already stored.
                cursor.execute(sql_check, (keyword,))
                result = cursor.fetchone()
                if result:
                    # Already present: bump its counter.
                    cursor.execute(sql_update, (keyword,))
                else:
                    # Not present yet: insert a new record.
                    cursor.execute(sql_insert, (keyword,))
        connection.commit()
    finally:
        connection.close()
# Script entry point
# Run the pipeline only when the file is executed as a script: extract
# keywords from the target page, then store them in MySQL.
if __name__ == "__main__":
    url = 'https://65.hk'  # Replace with your target URL
    keywords = extract_keywords(url)
    print(f"Extracted {len(keywords)} keywords.")
    save_to_mysql(keywords)
    print("Keywords saved to MySQL.")