# (Forum code-viewer toolbar text "[Python] 纯文本查看 复制代码" -- copy/paste residue, not part of the script.)
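# Extension: scrape the "神价监控" (bargain-price monitor) data from the manmanbuy site
# http://tool2.manmanbuy.com/PriceLessSort.aspx
# 1. Fetch the JD price-drop listings together with their redirect links
#    (or grab the product link directly and skip to step 3).
# 2. Get the URL behind the "去看看" ("take a look") button.
# 3. Normalise the URL and produce clean product name / price / URL records.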
import requests
import re
import pandas as pd
# Global accumulator: parse_page() appends one dict per scraped product to this
# list, and main() hands the list to write_to_excel() for export.
items = []
# Column name -> regex that extracts that column from the listing HTML.
# Insertion order is the key order of every record appended to ``items``.
# Listing structure the patterns rely on: r'<li class="proitem">(.*?)</li>'
_FIELD_PATTERNS = {
    'item_time': r'<li class="proitem">.*?<span>(.*?)</span>',
    # Store name
    'item_store': r'class=\'proinfo\'.*?alt=\'(.*?)\'',
    'item_name': r'<li class="proitem">.*?title=.*?>(.*?)</a>',
    'item_img': r'<li class="proitem">.*?src="(.*?)".*?</div>',
    # Bare JD link: captures only the part of the href that follows "url=".
    'item_url': r'<li class="proitem">.*?href=".*?url=(.*?)".*?</div>',
    # Original price (the "t7" cell)
    'item_original_price': r'class="t7">(.*?)</div>',
    # Amount saved
    'item_cut': r'<li class="proitem">.*?title="(.*?)"',
    # Current price (the "t3" cell)
    'item__now_price': r'class="t3">(.*?)</div>',
    # Discount
    'item_discount': r'<li class="proitem">.*?title=".*?">(.*?)</a>',
    # Historical lowest price (the "t5" cell)
    'item_lowest': r'class="t5".*?<span style=.*?>(.*?)</span>',
}


def parse_page(url):
    """Download one listing page and append its products to the global ``items``.

    The raw HTML is also written to ``manmanbuy.html`` (overwritten on every
    call) so the patterns can be checked against the last page fetched.

    Each column is extracted independently with its own regex and the columns
    are then zipped into records, so if the patterns match different numbers
    of times the records are cut off at the shortest column. The per-column
    match counts are printed at the end to make such a mismatch visible.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        The module-level ``items`` list, including the records just appended.

    Raises:
        requests.exceptions.RequestException: If the request fails, including
            when the server does not respond within the timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }
    # Without a timeout a single stalled connection would block the crawl
    # forever; requests never times out by default.
    resp = requests.get(url=url, headers=headers, timeout=10)
    text = resp.text
    with open('manmanbuy.html', 'w', encoding='utf-8') as f:
        f.write(text)

    # One list of matches per column, in the order declared in _FIELD_PATTERNS.
    columns = {
        field: re.findall(pattern, text, re.S)
        for field, pattern in _FIELD_PATTERNS.items()
    }

    # Zip the columns row-wise; each row becomes one record dict.
    for row in zip(*columns.values()):
        items.append(dict(zip(columns, row)))

    print(columns['item_store'])
    print(len(columns['item_store']))
    print(items)
    # Sanity check: every column should report the same number of matches.
    for matches in columns.values():
        print(str(len(matches)) + ',', end='')
    return items
def write_to_excel(items):
    """Export the scraped records to ``items.xlsx`` in the working directory.

    Args:
        items: Records to export; handed directly to ``pandas.DataFrame``.

    The sheet is written without the DataFrame index column.
    """
    pd.DataFrame(items).to_excel('items.xlsx', index=False)
def main(n):
    """Crawl ``n`` listing pages (PageID 0 .. n-1) and export the results.

    Every page URL is printed before it is fetched with ``parse_page``. Once
    all pages have been processed, the accumulated global ``items`` list is
    written out via ``write_to_excel``.

    Args:
        n: Number of pages to crawl.
    """
    # The ``q`` value sets the site's filter to JD; it was taken from the
    # request the site itself sends for that filter. An earlier variant
    # (``url_all`` in the original script) used
    # q=1,6,13__0_1_0_8_6_2__0_0__s0 instead.
    page_url = 'http://tool2.manmanbuy.com/PriceLessSort.aspx?PageID={0}&q=1_0_0_1_0_8_6_2__0_0__s0_0'
    # map() is lazy, so each URL is built, printed and fetched in turn.
    for url_jd in map(page_url.format, range(n)):
        print(url_jd)
        parse_page(url_jd)
    write_to_excel(items)
if __name__ == '__main__':
    # Run the crawler over the requested number of pages (100 here).
    main(n=100)