def geturl_data(url):
    """Fetch a ZOL overview page and extract the link to its full-spec page.

    Parameters:
        url: overview ("综述介绍") page URL on detail.zol.com.cn
            (page is GBK encoded).

    Returns:
        Absolute URL ("http://detail.zol.com.cn" + relative href) of the
        spec page referenced from the overview section.
    """
    # decode() already yields str — the original's extra str() was redundant.
    html = urllib.request.urlopen(url).read().decode("gbk")
    # Keep only the fragment between the "综述介绍" (overview) marker and
    # the "参数" (parameters) marker.
    html = html.split("综述介绍")[1]
    html = html.split("参数")[0]
    # Longer fragments carry an image gallery first; skip past "图片".
    if len(html) > 115:
        html = html.split("图片")[1]
    # Collapse all spaces so the regex below can match the flattened markup.
    html = html.replace(" ", "")
    print(html)
    # Spaces were stripped above, hence "<ahref=" instead of "<a href=".
    # Renamed from the original's `url`, which shadowed the parameter.
    hrefs = re.findall(r"<ahref=\"(.*?)\"target=", html)
    return "http://detail.zol.com.cn" + ''.join(hrefs)
def get_data(url):
    """Fetch a spec page and return its span values as a list of >= 5 strings.

    NOTE(review): a second ``get_data`` is defined later in this module and
    shadows this one at import time — confirm which definition is intended.

    Parameters:
        url: spec page URL on detail.zol.com.cn (page is GBK encoded).

    Returns:
        list[str] with at least 5 entries, taken from the section between
        the "外观" and "功能与服务" headers and padded with empty strings.
    """
    html = urllib.request.urlopen(url).read().decode("gbk")
    # Keep only the fragment between the "外观" (appearance) header and the
    # "功能与服务" (features & services) header.
    html = html.split('>外观</td>')[1]
    html = html.split("功能与服务")[0]
    # Drop anchor/icon markup that would pollute the extracted values.
    html = re.sub("<a.*?/a>", "", html)
    html = re.sub("<i.*?/a>", "", html)
    # Normalize line breaks; <br> variants become '+' separators.
    html = html.replace("\r", "")
    html = html.replace("\n", "")
    html = html.replace("<br />", "+")
    html = html.replace("<br/>", "+")
    # Strip spaces so the regex matches the flattened markup.
    html = html.replace(" ", "")
    # Each value sits in <span id="newPmVal_N">...</span>.
    datas = re.findall(r"<span id=\"newPmVal_([0-9]{1,2})\">(.*?)</span>", html)
    listdata = []
    length = len(datas)
    # Shape the row depending on how many values the page exposes.
    if length == 6:
        # The 5th entry is noise on 6-value pages; drop it.
        del datas[4]
        for data in datas:
            listdata.append(data[1])
    elif length == 3:
        if datas[2][1] == '':
            for data in datas:
                listdata.append(data[1])
            # Shift the trailing values into columns 3-4.
            listdata.insert(1, '')
            listdata.insert(1, '')
        else:
            for data in datas:
                listdata.append(data[1])
    elif length < 2:
        # BUGFIX: the original indexed datas[0] even when the regex matched
        # nothing (length == 0), raising IndexError. Guard against empty.
        if datas:
            listdata.append(datas[0][1])
    else:
        for data in datas:
            listdata.append(data[1])
    # Pad so callers always receive at least 5 columns.
    while len(listdata) < 5:
        listdata.append("")
    print(listdata)
    return listdata
def get_data(url):
    """Fetch a detail page and return (label, span_id, value) spec tuples.

    Note: this definition shadows the earlier ``get_data`` in this module.

    Parameters:
        url: detail page URL on detail.zol.com.cn (page is GBK encoded).

    Returns:
        list of (label, span_id, value) tuples matched from the spec table.
    """
    page = urllib.request.urlopen(url).read().decode("gbk")
    # Newer pages carry a "基本参数" table, older ones a "详细参数" table;
    # slice out whichever section is present.
    if page.find('基本参数') != -1:
        section = page.split('>基本参数</td>')[1].split("详细内容")[0]
    else:
        section = page.split('详细参数')[1].split("接下来要看")[0]
    # Flatten the markup: remove CR/LF and spaces so the row regex can match.
    section = section.replace("\r", "").replace("\n", "")
    section = section.replace(" ", "")
    # Remove the inline "纠错" (report-an-error) link text.
    section = section.replace("纠错", "")
    section = section.replace(" ", "")
    # One table row: <th...id...>label</...></th><td...<spanid="newPmVal_N">value</span>...</td></tr>
    row_pattern = re.compile(
        r"<th.*?id=.*?>(.*?)</.*?></th><td.*?<spanid=\"newPmVal_([0-9]{1,2})\">(.*?)</span>.*?</td></tr>"
    )
    return row_pattern.findall(section)
def get_config_data(con_id, tuple_data):
    """Clean raw (label, id, value) spec tuples into a fixed 6-column row.

    Parameters:
        con_id: identifier stored in column 0 of the result row.
        tuple_data: iterable of (label, span_id, value) tuples as produced
            by get_data(); value may still contain HTML tags.

    Returns:
        list of 6 entries: [con_id, cpu, memory, unlock, features, network].
        Each text column concatenates matching values, each followed by a
        space; columns with no match stay ''.
    """
    # Column layout: 0=id, 1=CPU, 2=RAM/ROM, 3=unlock, 4=features, 5=network.
    dict_data = {0: con_id, 1: '', 2: '', 3: '', 4: '', 5: ''}
    for data in tuple_data:
        label = data[0]
        # Strip leftover HTML tags and stray '>' characters from the value.
        # (The original's second sub for "</.*?>" was redundant — the first
        # pattern already removes closing tags.)
        data_str = re.sub("<.*?>", "", data[2]).replace(">", "")
        if label == 'CPU型号':
            dict_data[1] += (data_str + " ")
        if label in ('RAM容量', 'ROM容量'):
            dict_data[2] += (data_str + " ")
        # The empty label '' is deliberately routed to the unlock column.
        if label in ('解锁方式', ''):
            dict_data[3] += (data_str + " ")
        # '机身接口' was listed twice in the original tuple; once suffices.
        if label in ('连接与共享', '机身接口', 'NFC', 'WLAN功能', '导航', '感应器类型', '多媒体技术'):
            dict_data[4] += (data_str + " ")
        if label in ('5G网络', '4G网络', '3G网络', '支持频段', 'SIM卡类型', '其他网络参数'):
            dict_data[5] += (data_str + " ")
    return list(dict_data.values())
def get_img_list(url):
    """Fetch a product page and collect its (title, jpg-path) image pairs.

    Parameters:
        url: product page URL on detail.zol.com.cn (page is GBK encoded).

    Returns:
        list of (title, jpg_path) tuples; the title group is empty for
        pages without a carousel.
    """
    page = urllib.request.urlopen(url).read().decode("gbk")
    if page.find('nav end') != -1:
        # Carousel present: images sit between the "nav end" and "SPU" markers.
        fragment = page.split('nav end')[1].split("SPU")[0]
        pattern = "title=\"(.*?)\" class.*?swiper-lazy.*?//(.*?.jpg).*?>"
    else:
        # No carousel: fall back to the single item picture.
        print("无轮播图:" + url)
        fragment = page.split('item-pic')[1].split("点击看更多图片")[0]
        pattern = ">()<a.*?//(.*?.jpg).*?>"
    return re.findall(pattern, fragment)
def get_repetition_photo_forsql():
    """Return photo_ids of duplicate gallery rows in the phone_photo table.

    Rows sharing the same ``photo_min`` are duplicates; the row with the
    smallest ``photo_id`` in each group is kept, the rest are reported.

    Returns:
        Tuple of result rows (each a 1-tuple containing a photo_id);
        an empty tuple when the query fails.
    """
    # NOTE(review): connection parameters come from module-level globals
    # (sqlName, sqluser, sqlpass, database) defined elsewhere in the file.
    conn = pymysql.connect(sqlName, sqluser, sqlpass, database)
    cursor = conn.cursor()
    sql = "select photo_id FROM phone_photo \
    where photo_id not in \
    ( \
    select min(photo_id) as id from phone_photo GROUP BY photo_min \
    );"
    # BUGFIX: the original left `data` unbound when execute() raised,
    # so the `return data` below crashed with NameError. Initialize it.
    data = ()
    try:
        cursor.execute(sql)
        data = cursor.fetchall()
    except Exception:
        # A SELECT has nothing to roll back, but rolling back is harmless
        # and clears any aborted transaction state.
        conn.rollback()
        print("失败:" + sql)
    cursor.close()
    conn.close()
    return data
if __name__ == "__main__":
    # Step 1: refresh the base data by walking the paginated listing.
    # url = "http://detail.zol.com.cn/cell_phone_index/subcate57_list_"
    url = "http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_9_2_0_"
    for page_no in range(min_page, sum_page):
        page_url = url + str(page_no) + ".html"
        for record in get_phone_datas(page_url):
            set_phone_one(record)