中关村手机信息python爬取并保存进mysql中
import pymysqlimport urllib.request
import re
"""
sqlName 数据库hosts 默认 127.0.0.1
sqluser 数据库用户名 默认 root
sqlpass 数据库密码
database 数据库名称
"""
sqlName = "127.0.0.1"
sqluser = "root"
sqlpass = "*"
database = "*"
"""
min_page 起步页
sum_page 结束页
page_number 一页数量
"""
min_page = 1
sum_page = 2
page_number = 48
"""基础数据更新的三个步骤"""
def get_phone_datas(url):
"""获取基础信息数据"""
html = urllib.request.urlopen(url).read()
html = html.decode("gbk")
html = str(html)
"""裁切 从下面的文字开始"""
html = html.split("title=\"产品评分由高到低\"")
html = html[1]
html = html.split("<h3>手机品牌报价大全</h3>")
html = html[0]
# 对空格回车动手
html = html.replace("\r", "")
html = html.replace("\t", "")
html = html.replace("\n", "")
html = html.replace(" ", "")
html = html.replace("<br/>", "")
html = html.replace("<p>", "")
html = html.replace("</p>", "")
regex = "<imgwidth=\"220\"height=\"165\".src=\"(.*?)\"alt=\"(.*?)\"></a>"\
"<h3><ahref=\"(.*?)\"title=.*?<span>(.*?)</span></a></h3>.*?"\
"</b><b.*?class=\"price-type\">(\d*?)</b>.*?</span><divclass=\"goods-promotion\">" \
"</div>"
regex = re.compile(regex)
datas = re.findall(regex, html)
# 定义一个所有数据 列表
listdata = []
for data in datas:
# 取出id 号码一列
num = str(data[2])
# 取出 真正的id号码
num = re.sub("\\D", "", num)
# 定义一个列表放 当前行数据
listnum = data + tuple(num.split())
print(listnum)
listdata.append(listnum)
return listdata
def set_phone_mysql(datas):
"""增加数据"""
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
sql = "insert into phone_base" \
"(base_image, base_name, base_remarks, base_feature, base_price, base_id)"\
"values (%s, %s, %s, %s, %s, %s);"
try:
cursor.executemany(sql, datas)
conn.commit()
print("成功")
except Exception as e:
# 回滚数据
conn.rollback()
print("失败:" + sql, end="")
print(datas)
cursor.close()
conn.close()
return 1
def set_phone_one(data):
"""单个数据插入"""
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
sql = "insert into phone_base" \
"(base_image, base_name, base_remarks, base_feature, base_price, base_id)" \
"values (%s, %s, %s, %s, %s, %s);"
try:
cursor.execute(sql, data)
conn.commit()
print("成功")
except Exception as e:
# 回滚数据
conn.rollback()
print("失败:", end="")
print(data)
cursor.close()
conn.close()
"""数据url更新的5个步骤"""
def get_data_forsqlurl(start, end):
"""数据库base表中取得image路径"""
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
sql = "select base_id, base_remarks from phone_base ORDER BY base_id DESC"
sql = sql + " limit " + str(start) + ", " + str(end)
print(sql)
try:
cursor.execute(sql)
datas = cursor.fetchall()
except Exception as e:
# 回滚数据
conn.rollback()
print("失败:" + sql)
cursor.close()
conn.close()
return datas
def geturl_data(url):
"""获取到的所有数据网站的url"""
html = urllib.request.urlopen(url).read()
html = html.decode("gbk")
html = str(html)
"""裁切 从下面的文字开始"""
html = html.split("综述介绍")
html = html[1]
html = html.split("参数")
html = html[0]
if len(html) > 115:
html = html.split("图片")
html = html[1]
html = html.replace(" ", "")
print(html)
regex = "<ahref=\"(.*?)\"target="
regex = re.compile(regex)
url = re.findall(regex, html)
# print("url\t", url)
return "http://detail.zol.com.cn" + ''.join(url)
def get_data(url):
"""获取到全部数据的网站,用来获取参数列表"""
html = urllib.request.urlopen(url).read()
html = html.decode("gbk")
html = str(html)
html = html.split('>外观</td>')
html = html[1]
html = html.split("功能与服务")
html = html[0]
html = re.sub("<a.*?/a>", "", html)
html = re.sub("<i.*?/a>", "", html)
# 对空格回车动手
html = html.replace("\r", "")
html = html.replace("\n", "")
# html = html.replace(" ", "")
html = html.replace("<br />", "+")
html = html.replace("<br/>", "+")
# html = html.replace("<p>", "")
# html = html.replace("</p>", "")
# 干掉空格
html = html.replace(" ", "")
regex = "<span id=\"newPmVal_({1,2})\">(.*?)</span>"
# 匹配正则式
datas = re.findall(regex, html)
# 创建list 列表
listdata = []
# 参数长度
length = len(datas)
# 根据参数长度来判断 添加内容 6去掉空格 4,5不变 3修改
if length == 6:
del datas[4]
for data in datas:
listdata.append(data[1])
elif length == 3:
if datas[2][1] == '':
for data in datas:
listdata.append(data[1])
listdata.insert(1, '')
listdata.insert(1, '')
else:
for data in datas:
listdata.append(data[1])
elif length < 2:
listdata.append(datas[0][1])
else:
for data in datas:
listdata.append(data[1])
# 长度不够,加空格来凑5个元素
while len(listdata) < 5:
listdata.append("")
print(listdata)
return listdata
def set_facade_mysql(facade_id, listdata):
"""增加数据facade_id外观idlistdata 数据"""
listdata.append(facade_id)
listdata = tuple(listdata[1:])
print("listdata:\t", listdata)
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
sql = "insert into phone_facade" \
"(facade_id, facade_size, facade_weight, facade_texture, facade_other) " \
"values (%s, %s, %s, %s, %s);"
try:
#2阶 sql语句
# cursor.executemany(sql, listdata)
#
cursor.execute(sql, listdata)
conn.commit()
except Exception as e:
# 回滚数据
conn.rollback()
print("失败:" + sql)
cursor.close()
conn.close()
return 1
def set_testsql(uid, url):
listdata = []
listdata.append(uid)
listdata.append(url)
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
sql = "insert into testurl" \
"(uid, url) " \
"values (%s, %s);"
try:
#2阶 sql语句
# cursor.executemany(sql, listdata)
# 元组列表插入sql
cursor.execute(sql, listdata)
conn.commit()
print("成功")
except Exception as e:
# 回滚数据
conn.rollback()
print("失败:" + str(uid))
cursor.close()
conn.close()
return 1
def get_url_for_test_url(start, end):
""""获取所有数据"""
"""数据库 test_url表中取得参数路径"""
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
# 从test_url 表中将id 和 url取出来
sql = "select uid, url from testurl ORDER BY uid DESC"
sql = sql + " limit " + str(start) + ", " + str(end)
try:
cursor.execute(sql)
datas = cursor.fetchall()
except Exception as e:
print("失败:" + sql)
cursor.close()
conn.close()
return datas
def get_data(url):
"""获取到全部数据的网站,用来获取参数列表"""
html = urllib.request.urlopen(url).read()
html = html.decode("gbk")
html = str(html)
if html.find('基本参数') != -1:
html = html.split('>基本参数</td>')
html = html[1]
html = html.split("详细内容")
html = html[0]
else:
html = html.split('详细参数')
html = html[1]
html = html.split("接下来要看")
html = html[0]
# html = re.sub("<a.*?/<a>", "", html)
# pat = re.compile('>(.*?)<')
# print("html\t", ''.join(pat.findall(html)))
# print(html)
# 对空格回车动手
html = html.replace("\r", "")
html = html.replace("\n", "")
html = html.replace(" ", "")
# html = html.replace("<br />", "+")
# html = html.replace("<br/>", "+")
# html = html.replace("<p>", "")
# html = html.replace("</p>", "")
# html = html.replace("></a>", "")
# html = html.replace("</a>", "")
html = html.replace("纠错", "")
# 干掉空格
html = html.replace(" ", "")
# 对表格中的链接动手
# html = re.sub("<ahref.*?>", "", html)
# html = re.sub("<i.*?></i>", "", html)
# html = re.sub("<aclass=.*?id=", "<spanid=", html)
# html = re.sub("href.*?text=\"\">", ">", html)
# print(html)
regex = "<th.*?id=.*?>(.*?)</.*?></th><td.*?<spanid=\"newPmVal_({1,2})\">(.*?)</span>.*?</td></tr>"
# 匹配正则式
datas = re.findall(regex, html)
# for data in datas:
# print(data)
return datas
def get_config_data(con_id, tuple_data):
"""对数据进行清洗,得到符合的列表"""
dict_data = {0: con_id, 1: '', 2: '', 3: '', 4: '', 5: ''}
for data in tuple_data:
# 利用字符串替代来剔除干扰元素
data_str = data[2]
# 去掉 标签
data_str = re.sub("<.*?>", "", data_str)
data_str = re.sub("</.*?>", "", data_str)
data_str = data_str.replace(">", "")
if data[0] == 'CPU型号':
dict_data[1] += (data_str + " ")
if data[0] in ('RAM容量', 'ROM容量'):
dict_data[2] += (data_str + " ")
if data[0] in ('解锁方式', ''):
dict_data[3] += (data_str + " ")
if data[0] in ('连接与共享', '机身接口', 'NFC', 'WLAN功能', '导航', '感应器类型', '机身接口', '多媒体技术'):
dict_data[4] += (data_str + " ")
if data[0] in ('5G网络', '4G网络', '3G网络', '支持频段', 'SIM卡类型', '其他网络参数'):
dict_data[5] += (data_str + " ")
# print(dict_data)
return list(dict_data.values())
def set_config_mysql(listdatas):
"""增加配置信息数据listdatas 数据集"""
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
# 防止插入失败,所以先删表
# cursor.execute("DELETE FROM phone_config")
sql = "insert into phone_config" \
"(config_id, soc_name, config_memory, config_unlock, config_networking, config_remork) "\
"values (%s, %s, %s, %s, %s, %s);"
try:
# 2阶 sql语句
# cursor.executemany(sql, listdatas)
# 元组列表插入sql
cursor.execute(sql, listdatas)
conn.commit()
except Exception as e:
# 回滚数据
conn.rollback()
print("失败:" + sql)
cursor.close()
conn.close()
return 1
def get_facade_data(con_id, tuple_data):
"""对数据进行清洗,得到符合的列表"""
dict_data = {0: con_id, 1: '', 2: '', 3: '', 4: ''}
for data in tuple_data:
# 利用字符串替代来剔除干扰元素
data_str = data[2]
# 去掉 标签
data_str = re.sub("<.*?>", "", data_str)
data_str = re.sub("</.*?>", "", data_str)
data_str = data_str.replace(">", "")
if data[0] == '手机尺寸':
dict_data[1] += (data_str + " ")
if data[0] == '手机重量':
dict_data[2] += (data_str + " ")
if data[0] == '机身材质':
dict_data[3] += (data_str + " ")
if data[0] == '其他外观参数':
dict_data[4] += (data_str + " ")
# print(dict_data)
return list(dict_data.values())
def set_facade_mysql(listdatas):
"""增加数据facade_id外观idlistdata 数据"""
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
# cursor.execute("DELETE FROM phone_facade")
sql = "insert into phone_facade" \
"(facade_id, facade_size, facade_weight, facade_texture, facade_other) "\
"values (%s, %s, %s, %s, %s);"
try:
#2阶 sql语句
# cursor.executemany(sql, listdatas)
cursor.execute(sql, listdatas)
conn.commit()
except Exception as e:
# 回滚数据
conn.rollback()
print("失败:" + sql)
cursor.close()
conn.close()
return 1
def get_show_data(show_id, tuple_data):
"""对数据进行清洗,得到符合的屏幕参数列表"""
dict_data = {'ID': show_id, 'texture': '', 'size': '', 'resolratio': '', 'arrange': '',
'ppi': '', 'aod': '', 'colorControl': '', 'maxLuinance': '', 'stimulateLuminance': '',
'colorTem': '', 'srgb': '', 'p3': '', 'quality': '', 'other': ''}
for data in tuple_data:
# 利用字符串替代来剔除干扰元素
data_str = data[2]
# 去掉 标签
data_str = re.sub("<.*?>", "", data_str)
data_str = re.sub("</.*?>", "", data_str)
data_str = data_str.replace(">", "")
if data[0] == '主屏尺寸':
dict_data['size'] += data_str
if data[0] == '主屏材质':
dict_data['texture'] += data_str
if data[0] == '主屏分辨率':
dict_data['resolratio'] += data_str
if data[0] == '屏幕像素密度':
dict_data['ppi'] += data_str
if data[0] in ('窄边框', 'HDR技术', '屏幕占比', '屏幕技术'):
dict_data['other'] += (data[0] + ":" + data_str + " ")
if data[0] == '其他屏幕参数':
dict_data['other'] += (data_str + " ")
# print(dict_data)
return list(dict_data.values())
def set_show_mysql(listdatas):
"""增加屏幕信息数据listdatas 数据集"""
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
# cursor.execute("DELETE FROM phone_show")
sql = "insert into phone_show" \
" " \
"values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
try:
# 2阶 sql语句
# cursor.executemany(sql, listdatas)
# 元组列表插入sql
cursor.execute(sql, listdatas)
conn.commit()
except Exception as e:
# 回滚数据
conn.rollback()
print("失败:" + sql)
cursor.close()
conn.close()
return 1
def get_battery_data(bat_id, tuple_data):
"""对数据进行清洗,得到符合的电池参数列表"""
dict_data = {'ID': bat_id, 'capacity': '', 'scheme': '', 'cableCharger': '', 'wirelessCharger': '',
'cableTime': '', 'wirelessTime': '', 'endurance': '', 'other': ''}
for data in tuple_data:
# 利用字符串替代来剔除干扰元素
data_str = data[2]
# 去掉 标签
data_str = re.sub("<.*?>", "", data_str)
data_str = re.sub("</.*?>", "", data_str)
data_str = data_str.replace(">", "")
if data[0] == '电池容量':
dict_data['capacity'] += data_str
if data[0] == '续航时间':
dict_data['endurance'] += data_str
if data[0] == '电池充电':
dict_data['scheme'] += data_str
if data[0] == '其他硬件参数':
dict_data['other'] += data_str
# print(dict_data)
return list(dict_data.values())
def set_battery_mysql(listdatas):
"""增加电池信息数据listdatas 数据集"""
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
# cursor.execute("DELETE FROM phone_battery")
# conn.commit()
sql = "insert into phone_battery" \
" " \
"values (%s, %s, %s, %s, %s, %s, %s, %s, %s);"
try:
# 2阶 sql语句
# cursor.executemany(sql, listdatas)
# 元组列表插入sql
cursor.execute(sql, listdatas)
conn.commit()
except Exception as e:
# 回滚数据
conn.rollback()
print("失败:" + sql)
cursor.close()
conn.close()
return 1
def get_camera_data(camera_id, tuple_data):
"""对数据进行清洗,得到符合的相机参数列表"""
dict_data = {'ID': camera_id, 'amount': '', 'cmos': '', 'pixel': '', 'focalLength': '',
'aperture': '', 'ois': '', 'fosusing': '', 'graph': '', 'video': '',
'steadyVideo': '', 'slowMotion': '', 'remarks': ''}
for data in tuple_data:
# 利用字符串替代来剔除干扰元素
data_str = data[2]
# 去掉 标签
data_str = re.sub("<.*?>", "", data_str)
data_str = re.sub("</.*?>", "", data_str)
data_str = data_str.replace(">", "")
if data[0] == '摄像头总数':
dict_data['amount'] += data_str
if data[0] == '传感器型号':
dict_data['cmos'] += data_str
if data[0] in ('前置摄像头', '后置摄像头', '摄像头特色', '广角'):
dict_data['pixel'] += (data[0] + ":" + data_str + " ")
if data[0] == '焦距/范围':
dict_data['focalLength'] += data_str
if data[0] == '光圈':
dict_data['aperture'] += data_str
if data[0] == '拍照功能':
dict_data['graph'] += data_str
if data[0] == '视频拍摄':
dict_data['video'] += data_str
if data[0] == '其他摄像头参数':
dict_data['remarks'] += (data_str + " ")
# print(dict_data)
return list(dict_data.values())
def set_camera_mysql(listdatas):
"""增加相机信息数据listdatas 数据集"""
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
# cursor.execute("DELETE FROM phone_camera")
sql = "insert into phone_camera" \
" " \
"values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
try:
# 2阶 sql语句
# cursor.executemany(sql, listdatas)
# 元组列表插入sql
cursor.execute(sql, listdatas)
conn.commit()
except Exception as e:
# 回滚数据
conn.rollback()
print("失败:" + sql)
cursor.close()
conn.close()
return 1
def get_img_list(url):
"""爬取网站,获取正确的图片集"""
html = urllib.request.urlopen(url).read()
html = html.decode("gbk")
html = str(html)
# 无轮播图处理
if html.find('nav end') != -1:
html = html.split('nav end')
html = html[1]
html = html.split("SPU")
html = html[0]
regex = "title=\"(.*?)\" class.*?swiper-lazy.*?//(.*?.jpg).*?>"
else:
print("无轮播图:" + url)
html = html.split('item-pic')
html = html[1]
html = html.split("点击看更多图片")
html = html[0]
regex = ">()<a.*?//(.*?.jpg).*?>"
# 匹配正则式
datas = re.findall(regex, html)
# for data in datas:
# print(data)
return datas
def set_img_sql_list(listdatas):
"""增加相机信息数据listdatas 数据集"""
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
sql = "insert into phone_photo" \
" " \
"values (null , %s, %s, %s, %s);"
try:
# 2阶 sql语句
# cursor.executemany(sql, listdatas)
# 元组列表插入sql
cursor.execute(sql, listdatas)
conn.commit()
except Exception as e:
# 回滚数据
conn.rollback()
print("失败:" + sql)
cursor.close()
conn.close()
return 1
def replace_sub(str):
"""正则式替换图片中的_axb"""
p = "_(.*?)/"
# print(re.compile(p).findall(str))
str = re.sub(p, "/", str)
return str
def get_repetition_photo_forsql():
"""获得数据库里面重复图集的数据"""
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
sql = "select photo_id FROM phone_photo \
where photo_id not in \
( \
select min(photo_id) as id from phone_photo GROUP BY photo_min \
);"
try:
cursor.execute(sql)
data = cursor.fetchall()
except Exception as e:
# 回滚数据
conn.rollback()
print("失败:" + sql)
cursor.close()
conn.close()
return data
def delete_repetition_photo_mysql(datas):
"""删除重复的图集数据"""
conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
# 获取mysql游标
cursor = conn.cursor()
sql = "delete from phone_photo where photo_id = %s"
try:
# cursor.executemany(sql, datas)
cursor.execute(sql, datas)
conn.commit()
print("成功")
except Exception as e:
# 回滚数据
conn.rollback()
print("失败:" + sql, end="\t")
print(datas)
cursor.close()
conn.close()
return 1
if __name__ == "__main__":
"""第一步,更新基础数据"""
# url = "http://detail.zol.com.cn/cell_phone_index/subcate57_list_"
url = "http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_9_2_0_"
for i in range(min_page, sum_page):
datas = get_phone_datas(url + str(i) + ".html")
for data in datas:
set_phone_one(data)
"""第二步更新url, 按照最近200条更新"""
datas = get_data_forsqlurl((min_page - 1) * page_number, sum_page * page_number)
for data in datas:
facade_id = data[0]
url = ''.join(data[1])
# 每台设备参数网址的获取
url = "http://detail.zol.com.cn" + url
url = geturl_data(url)
print("uid\t", facade_id)
print("参数详情url:", url)
set_testsql(facade_id, url)
"""第三步,保存详细配置"""
datas = get_url_for_test_url((min_page - 1) * page_number, sum_page * page_number)
# 二阶列表,存放需要的数据集
listdatas = []
for data in datas:
config_id = data[0]
# 拼接得到真正的链接
url = ''.join(data[1])
print("url\t", url)
data = get_data(url)
set_config_mysql((tuple(get_config_data(config_id, data))))
set_battery_mysql(tuple(get_battery_data(config_id, data)))
set_camera_mysql(tuple(get_camera_data(config_id, data)))
set_facade_mysql(tuple(get_facade_data(config_id, data)))
set_show_mysql(tuple(get_show_data(config_id, data)))
"""第四步,保存图集, 会出现大量重复数据,先不插入"""
datas = get_url_for_test_url((min_page - 1) * page_number, sum_page * page_number + 40)
# 二阶列表,存放需要的数据集
listdatas = []
for data in datas:
config_id = data[0]
# 拼接得到真正的链接
url = ''.join(data[1])
# 处理url
url = url.replace("http://detail.zol.com.cn", "https://wap.zol.com.cn").replace("param.shtml", "index.html")
# print(url)
try:
tuple_datas = get_img_list(url)
except Exception as e:
print("失败:" + url)
continue
for tuple_data in tuple_datas:
img = tuple_data[1]
img = replace_sub(img)
list_data = list(tuple_data)
# 第一个元素插入
list_data.insert(0, config_id)
# 最后一个元素插入
list_data.append(img)
# print(list_data)
set_img_sql_list(tuple(list_data))
"""去除重复的图集数据"""
datas = get_repetition_photo_forsql()
for data in datas:
print(data[0])
delete_repetition_photo_mysql(data[0])
好长,看着好厉害 大哥,你打开代码界面啊,这样看好累的 copy下来,学习学习:lol
页:
[1]