欧路词典爬虫

liuye 发表于 2022-4-18 16:22

本帖最后由 liuye 于 2022-4-18 16:53 编辑

最近在做一个和英语学习相关的网站，需要一个汉英词典数据库。淘宝上的那些又贵又不全，所以自己写了个爬虫来爬，现在分享给大家。
import pymysql
import requests
from lxml import etree

def commitSQL(conn, sql, params=None):
    """Execute one write statement and commit it, rolling back on failure.

    Args:
        conn: Open DB-API connection providing ``cursor()``, ``commit()``
            and ``rollback()``.
        sql: Statement to execute. Prefer driver placeholders together with
            ``params`` over formatting values into the string.
        params: Optional values bound to the placeholders in ``sql``.

    Returns:
        True if the statement was committed, False if it failed and the
        transaction was rolled back.
    """
    cursor = conn.cursor()
    try:
        # Only pass params when given: not every DB-API driver accepts None
        # as the parameter argument.
        if params is None:
            cursor.execute(sql)
        else:
            cursor.execute(sql, params)
        conn.commit()
        return True
    except Exception as e:
        # Best-effort write: undo the failed statement and report it instead
        # of silently discarding the error.
        conn.rollback()
        print("SQL执行失败:", e)
        return False
    finally:
        cursor.close()

def getConnect():
    """Open a new MySQL connection from the settings in the ``configure`` file.

    The file is opened by the relative path ``configure`` and is expected to
    hold one ``key=value`` pair per line. The keys ``host``, ``port``,
    ``user``, ``password``, ``database`` and ``charset`` are required.

    Returns:
        A new ``pymysql`` connection; the caller is responsible for closing it.

    Raises:
        OSError: If the configuration file cannot be opened.
        KeyError: If a required key is missing from the file.
        ValueError: If ``port`` is not an integer.
    """
    canshu = {}
    # Load the configuration file.
    with open('configure', 'r') as f:
        for line in f:
            # Split on the first '=' only, so values may themselves contain '='.
            key, sep, value = line.rstrip('\r\n').partition('=')
            if not sep:
                # Blank line or a line without '=': nothing to record.
                continue
            canshu[key.strip()] = value
    conn = pymysql.connect(host=canshu['host'],
                           port=int(canshu['port']),
                           user=canshu['user'],
                           password=canshu['password'],
                           database=canshu['database'],
                           charset=canshu['charset'])
    return conn

def getInfo(word):
    """Download the dict.eudic.net entry for ``word`` and extract its fields.

    Args:
        word: Headword to look up; it is appended to the dictionary URL.

    Returns:
        A tuple ``(meanings, British, American, english, chinese)``:
        ``meanings`` is the list of text nodes under ``//ol/li`` (falling back
        to ``//div[@class='exp']``), ``British`` and ``American`` are lists of
        phonetic strings, and ``english`` / ``chinese`` are lists with the
        text of each example line and of each example explanation.
        Returns ``None`` if the request or the parsing fails.
    """
    print("正在爬取单词:"+word)
    try:
        # A timeout keeps one stalled request from hanging the whole crawl.
        respsons = requests.get("https://dict.eudic.net/dicts/en/"+word, timeout=10)
        html = etree.HTML(respsons.content)
        if html is None:
            raise ValueError("empty or unparsable page")

        meaningXpath = html.xpath("//ol/li/text()")
        if not meaningXpath:
            meaningXpath = html.xpath("//div[@class='exp']/text()")

        # NOTE(review): both pronunciations come from the same query, so the
        # two lists are always identical. Presumably one entry is the British
        # and another the American phonetic - confirm against the page markup.
        British = html.xpath("//span[@class='Phonitic']/text()")
        American = html.xpath("//span[@class='Phonitic']/text()")

        english = [node.xpath("string(.)")
                   for node in html.xpath("//p[@class='line']")]
        chinese = [node.xpath("string(.)")
                   for node in html.xpath("//div[@class='sentence']//p[@class='exp']")]
    except Exception:
        print("单词" + word +"爬取失败")
        return None

    print("单词" + word + "爬取成功")
    print(meaningXpath, British, American, english, chinese)
    return meaningXpath, British, American, english, chinese

def _pick_text(values, index):
    """Return ``values[index]`` as text, clamped to the last item; '' if empty."""
    if isinstance(values, str):
        return values
    items = list(values)
    if not items:
        return ""
    return str(items[min(index, len(items) - 1)])


def _example_words(sentences):
    """Return the distinct lower-cased words of ``sentences`` in first-seen order."""
    strip_punctuation = str.maketrans("", "", ",.?!")
    words = {}
    for sentence in sentences:
        # split() without an argument drops the empty tokens that repeated
        # spaces or line breaks would otherwise produce.
        for token in sentence.translate(strip_punctuation).lower().split():
            words[token] = None
    return list(words)


def save(word):
    """Crawl ``word``, store its data, then crawl the words of its examples.

    One row is written to ``word``, one row per meaning to ``meaning`` and
    one row per example pair to ``example``. Each insert is committed on its
    own through ``commitSQL``, so a failing row does not stop the others.
    Afterwards ``run`` is called for every distinct word in the English
    example sentences.

    Args:
        word: Headword to crawl and store.

    NOTE: ``save`` and ``run`` call each other recursively, so a long chain
    of unseen words can reach Python's recursion limit; the resulting
    RecursionError is caught by the handler at the end of this function.
    """
    print("正在存储单词:"+word)
    try:
        info = getInfo(word)
        if info is None:
            # getInfo already reported the crawl failure; nothing to store.
            print("单词" + word+"存储失败")
            return
        meaningXpath, British, American, english, chinese = info

        # NOTE(review): assumes the page lists the British phonetic first and
        # the American one second - confirm against the live markup.
        rows = [(
            "insert into word(word,british_pronunciation,american_pronunciation) "
            "values (%s,%s,%s)",
            (word, _pick_text(British, 0), _pick_text(American, 1)),
        )]
        rows += [
            ("insert into meaning(meaning,word) values (%s,%s)", (str(meaning), word))
            for meaning in meaningXpath
        ]
        # Pair each English sentence with its own translation; zip stops at
        # the shorter list if the page yields unequal counts.
        rows += [
            ("insert into example(word,english,chinese) values (%s,%s,%s)",
             (word, str(sentence), str(translation)))
            for sentence, translation in zip(english, chinese)
        ]

        connect = getConnect()
        try:
            # Let the driver quote and escape every value, so quotes in the
            # crawled text can neither break nor inject into the statement.
            cursor = connect.cursor()
            try:
                statements = [cursor.mogrify(sql, params) for sql, params in rows]
            finally:
                cursor.close()
            for statement in statements:
                commitSQL(connect, statement)
        finally:
            # Release the connection before recursing into further words.
            connect.close()
        print("单词" + word +"存储成功")

        for token in _example_words(english):
            run(token)
    except Exception as e:
        print("单词" + word+"存储失败", e)

def run(startWord):
    """Crawl and store ``startWord`` unless the ``word`` table already has it.

    Args:
        startWord: Headword to process. An empty value is ignored.
    """
    if not startWord:
        # An empty token would otherwise trigger a crawl of the bare base URL.
        return

    conn = getConnect()
    try:
        cur = conn.cursor()
        try:
            # Bound parameter instead of string formatting: the word comes
            # from crawled text and may contain quotes.
            cur.execute("SELECT 1 FROM word WHERE word=%s LIMIT 1", (startWord,))
            exists = cur.fetchone() is not None
        finally:
            cur.close()
    finally:
        # Close before save() runs, so recursion does not pile up connections.
        conn.close()

    if not exists:
        save(startWord)

if __name__ == '__main__':
    import sys

    # Seed word for the crawl: first command-line argument if given,
    # otherwise the original default 'test'.
    word = sys.argv[1] if len(sys.argv) > 1 else 'test'
    run(word)

配置文件(与代码同级目录下命名为configure)

host=localhost
port=3306
user=root
password=root
database=dict
charset=utf8
数据库结构
/*
SQLyog Enterprise v12.09 (64 bit)
MySQL - 5.5.40 : Database - dict
*********************************************************************
*/

/* Schema script for the crawler's `dict` database: sets up the session, creates the database if it is missing and selects it for the table definitions that follow. */

/*!40101 SET NAMES utf8 */;

/*!40101 SET SQL_MODE=''*/;

/* Save the session's current settings in @OLD_* variables while this script overrides them; they are restored at the end of the script. */
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
CREATE DATABASE /*!32312 IF NOT EXISTS*/`dict` /*!40100 DEFAULT CHARACTER SET utf8 */;

USE `dict`;

/*Table structure for table `example` */
/* One row per example sentence stored by the crawler's save(): `word` is the headword the sentence was crawled for, `english` the sentence text and `chinese` the Chinese text paired with it. */
/* NOTE: the DROP below discards all existing rows whenever this script is run. */

DROP TABLE IF EXISTS `example`;

CREATE TABLE `example` (
`word` varchar(64) DEFAULT NULL,
`english` text,
`chinese` text
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

/*Table structure for table `meaning` */
/* One row per meaning stored by the crawler's save(): `meaning` is the definition text and `word` the headword it belongs to. */
/* NOTE(review): `word` is char(64) here but varchar(64) in the `word` and `example` tables - confirm whether the difference is intended. */
/* NOTE: the DROP below discards all existing rows whenever this script is run. */

DROP TABLE IF EXISTS `meaning`;

CREATE TABLE `meaning` (
`meaning` text,
`word` char(64) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

/*Table structure for table `word` */
/* One row per crawled headword with its British and American pronunciation. */
/* The crawler's run() looks a headword up by `word` before every crawl, so the column is indexed to avoid a full table scan per lookup. */
/* NOTE: the DROP below discards all existing rows whenever this script is run. */

DROP TABLE IF EXISTS `word`;

CREATE TABLE `word` (
  `word` varchar(64) DEFAULT NULL,
  `british_pronunciation` varchar(128) DEFAULT NULL,
  `american_pronunciation` varchar(128) DEFAULT NULL,
  KEY `idx_word` (`word`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

/* Restore the session settings that were saved in the @OLD_* variables at the top of this script. */
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;

大概原理就是输入起始单词，爬取数据（包括英音、美音、翻译、例句）后，对例句中的所有单词进行同样的操作。
频繁访问这个API是会封IP的，所以要么拉长战线，要么买代理IP的服务。

zohoChou 发表于 2022-4-22 23:27

liuye 发表于 2022-4-22 23:16
主要是代理IP太贵了

https://blog.csdn.net/weixin_44613063/article/details/102538757

jimmywong85 发表于 2022-4-18 16:33

很厉害，但不知道是怎么用的

aeqvkec 发表于 2022-4-18 16:43

厉害, 先收藏学习, 未来应该用得着

hawk005 发表于 2022-4-18 16:57

牛牛牛，向楼主学习！

WanShao 发表于 2022-4-18 17:21

请问代码怎么用

Flytom 发表于 2022-4-18 17:52

牛牛牛，向楼主学习！

liu2514 发表于 2022-4-18 18:42

Python小白来取取经！感谢分享！

头狼发表于 2022-4-18 19:01

大佬请分享个爬好的数据吧{:1_918:}

f23258 发表于 2022-4-18 20:51

哈，在手机上直接买了正版。

Cacarot 发表于 2022-4-18 21:50

多谢，收藏备用

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

欧路词典爬虫