I've recently been building a website for English learners and needed a Chinese-English dictionary database. The ones sold on Taobao are expensive and incomplete, so I wrote a crawler to scrape my own. Sharing it here.
[Python]
import pymysql
import requests
from lxml import etree

# Execute a statement and commit, rolling back on failure.
# Parameters are passed separately so pymysql escapes them, which also
# keeps sentences containing quotes from breaking the query.
def commitSQL(conn, sql, args=None):
    cursor = conn.cursor()
    try:
        cursor.execute(sql, args)
        conn.commit()
    except Exception:
        conn.rollback()

# Open a database connection
def getConnect():
    canshu = {}
    # Load the configuration file (one key=value per line)
    with open('configure', 'r') as f:
        for x in f.read().split('\n'):
            if '=' in x:
                canshu[x.split('=')[0]] = x.split('=')[-1]
    conn = pymysql.connect(host=canshu['host'],
                           port=int(canshu['port']),
                           user=canshu['user'],
                           password=canshu['password'],
                           database=canshu['database'],
                           charset=canshu['charset'])
    return conn

def getInfo(word):
    print("Crawling word: " + word)
    response = requests.get("https://dict.eudic.net/dicts/en/" + word)
    html = etree.HTML(response.content)
    english = []
    chinese = []
    try:
        # Definitions: try the numbered list first, fall back to the 'exp' div
        meanings = html.xpath("//ol/li/text()")
        if meanings == []:
            meanings = html.xpath("//div[@class='exp']/text()")
        # First Phonitic span is the UK pronunciation, the last is the US one
        British = html.xpath("//span[@class='Phonitic'][1]/text()")[0]
        American = html.xpath("//span[@class='Phonitic'][last()]/text()")[0]
        # Example sentences and their Chinese translations
        englishXpath = html.xpath("//p[@class='line']")
        chineseXpath = html.xpath("//div[@class='sentence']//p[@class='exp']")
        for i in englishXpath:
            english.append(i.xpath("string(.)"))
        for i in chineseXpath:
            chinese.append(i.xpath("string(.)"))
        print("Word " + word + " crawled successfully")
        print(meanings, British, American, english, chinese)
        return meanings, British, American, english, chinese
    except Exception:
        print("Word " + word + " failed to crawl")
        return None

def save(word):
    print("Storing word: " + word)
    try:
        info = getInfo(word)
        if info is None:  # crawl failed, skip this word
            return
        meanings, British, American, english, chinese = info
        connect = getConnect()
        commitSQL(connect, "insert into word(word,british_pronunciation,american_pronunciation) "
                           "values (%s,%s,%s)", (word, British, American))
        for m in meanings:
            commitSQL(connect, "insert into meaning(meaning,word) values (%s,%s)", (m, word))
        for i in range(len(english)):
            commitSQL(connect, "insert into example(word,english,chinese) values (%s,%s,%s)",
                      (word, english[i], chinese[i]))
        print("Word " + word + " stored successfully")
        # Strip punctuation from every example sentence, then crawl each word in it
        for sentence in english:
            tokens = sentence.replace(",", "").replace(".", "").replace("?", "").replace("!", "").lower().split(" ")
            for x in tokens:
                run(x)
    except Exception:
        print("Word " + word + " failed to store")

def run(startWord):
    # Only crawl words that are not in the database yet
    cur = getConnect().cursor()
    cur.execute("SELECT * FROM word WHERE word=%s", (startWord,))
    if len(cur.fetchall()) == 0:
        save(startWord)

if __name__ == '__main__':
    word = 'test'
    run(word)
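After a run, you can do a quick sanity check on what landed in the database. A minimal sketch that reuses getConnect() from the script above:

[Python]
# Count what the crawl has stored so far
conn = getConnect()
cur = conn.cursor()
cur.execute("SELECT COUNT(*) FROM word")
print("words stored:", cur.fetchone()[0])
cur.execute("SELECT COUNT(*) FROM example")
print("example sentences stored:", cur.fetchone()[0])
conn.close()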
Configuration file (save it as "configure" in the same directory as the script):
host=localhost
port=3306
user=root
password=root
database=dict
charset=utf8
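The script parses this file by hand; if you prefer the standard library, configparser can read the same key=value format once a dummy section header is prepended. A minimal sketch (the getConnectArgs name is my own, not part of the script above):

[Python]
import configparser

def getConnectArgs(path='configure'):
    # configparser requires a section header, so prepend a dummy one
    parser = configparser.ConfigParser()
    with open(path, 'r') as f:
        parser.read_string('[db]\n' + f.read())
    return dict(parser['db'])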
Database schema:
[SQL]
/*
SQLyog Enterprise v12.09 (64 bit)
MySQL - 5.5.40 : Database - dict
*********************************************************************
*/
/*!40101 SET NAMES utf8 */;
/*!40101 SET SQL_MODE=''*/;
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
CREATE DATABASE /*!32312 IF NOT EXISTS*/`dict` /*!40100 DEFAULT CHARACTER SET utf8 */;
USE `dict`;
/*Table structure for table `example` */
DROP TABLE IF EXISTS `example`;
CREATE TABLE `example` (
`word` varchar(64) DEFAULT NULL,
`english` text,
`chinese` text
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
/*Table structure for table `meaning` */
DROP TABLE IF EXISTS `meaning`;
CREATE TABLE `meaning` (
`meaning` text,
`word` char(64) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
/*Table structure for table `word` */
DROP TABLE IF EXISTS `word`;
CREATE TABLE `word` (
`word` varchar(64) DEFAULT NULL,
`british_pronunciation` varchar(128) DEFAULT NULL,
`american_pronunciation` varchar(128) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
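Note that none of these tables has a primary key or index, so the existence check in run() does a full table scan for every word, and nothing prevents duplicate rows. A one-off sketch that adds a unique index (the uk_word name is my own; the connection values match the configure file above):

[Python]
import pymysql

# A unique index on word.word lets run()'s SELECT use an index lookup
# instead of a full scan, and rejects duplicate word rows outright
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='root', database='dict', charset='utf8')
cur = conn.cursor()
cur.execute("ALTER TABLE `word` ADD UNIQUE KEY `uk_word` (`word`)")
conn.commit()
conn.close()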
The basic idea: start from a seed word, scrape its data (UK pronunciation, US pronunciation, definitions, and example sentences), then run the same process on every word in the example sentences.
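One caveat: run() calls save(), which calls run() again for every word in every example sentence, so a long crawl can hit Python's recursion limit. A minimal iterative sketch with an explicit queue (it reuses getInfo() from the script above; the crawl() function, the seen set, and the limit parameter are my own additions):

[Python]
from collections import deque

def crawl(startWord, limit=1000):
    # Breadth-first crawl with an explicit queue instead of recursion
    queue = deque([startWord])
    seen = set()
    while queue and len(seen) < limit:
        word = queue.popleft()
        if word in seen or word == '':
            continue
        seen.add(word)
        info = getInfo(word)  # getInfo() from the script above
        if info is None:
            continue
        # ... insert into word/meaning/example via commitSQL as in save() ...
        english = info[3]
        for sentence in english:
            cleaned = sentence.replace(",", "").replace(".", "").replace("?", "").replace("!", "")
            for token in cleaned.lower().split():
                queue.append(token)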
Hitting this API too often will get your IP banned, so either stretch the crawl out over a long period or pay for a proxy IP service.
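For the stretch-it-out option, a minimal throttling sketch you could drop in place of the requests.get() call in getInfo() (the 1-3 second delay and the PROXY placeholder are my own assumptions, not values the site documents):

[Python]
import time
import random
import requests

# Optional proxy, e.g. {'http': 'http://1.2.3.4:8080', 'https': 'http://1.2.3.4:8080'}
PROXY = None

def fetch(word):
    # Sleep a random 1-3 seconds between requests to spread the load out
    time.sleep(random.uniform(1.0, 3.0))
    return requests.get("https://dict.eudic.net/dicts/en/" + word,
                        proxies=PROXY, timeout=10)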