Practice: a single-threaded crawler for manmanbuy.com's "神价监控" (bargain-price monitor) data
As the title says. First post here, slightly nervous.
I'm a newcomer practicing web scraping, so let's go straight to the code.
# Extension: scrape the "神价监控" (bargain-price monitor) data from manmanbuy.com
# http://tool2.manmanbuy.com/PriceLessSort.aspx
# 1. Get the JD price-drop listings plus the redirect links (or grab the item details directly -> 3)
# 2. Get the "去看看" (go take a look) URL
# 3. Clean up the URL and produce tidy name/price/URL records
import requests
import re
import pandas as pd
# Global list that collects the items parsed from every page
items = []


def parse_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }
    resp = requests.get(url=url, headers=headers)
    text = resp.text
    # Dump the raw HTML so the regexes below can be debugged against it
    with open('manmanbuy.html', 'w', encoding='utf-8') as f:
        f.write(text)
    # Each item sits in a block matching r'<li class="proitem">(.*?)</li>'
    item_times = re.findall(r'<li class="proitem">.*?<span>(.*?)</span>', text, re.S)
    item_imgs = re.findall(r'<li class="proitem">.*?src="(.*?)".*?</div>', text, re.S)
    item_names = re.findall(r'<li class="proitem">.*?title=.*?>(.*?)</a>', text, re.S)
    # Store name
    item_stores = re.findall(r'class=\'proinfo\'.*?alt=\'(.*?)\'', text, re.S)
    # Product link (redirect version)
    # item_urls = re.findall(r'<li class="proitem">.*?href="(.*?)".*?</div>', text, re.S)
    # Bare JD link (the url= parameter inside the redirect link)
    item_urls = re.findall(r'<li class="proitem">.*?href=".*?url=(.*?)".*?</div>', text, re.S)
    # Original price, t7
    item_original_prices = re.findall(r'class="t7">(.*?)</div>', text, re.S)
    # Amount saved
    item_cuts = re.findall(r'<li class="proitem">.*?title="(.*?)"', text, re.S)
    # Current price, t3
    item_now_prices = re.findall(r'class="t3">(.*?)</div>', text, re.S)
    # Discount
    item_discounts = re.findall(r'<li class="proitem">.*?title=".*?">(.*?)</a>', text, re.S)
    # Historical lowest price, t5
    item_lowests = re.findall(r'class="t5".*?<span style=.*?>(.*?)</span>', text, re.S)
    # Zip the parallel lists into one tuple per item
    for value in zip(item_times, item_stores, item_names, item_imgs, item_urls,
                     item_original_prices, item_cuts, item_now_prices,
                     item_discounts, item_lowests):
        (item_time, item_store, item_name, item_img, item_url, item_original_price,
         item_cut, item_now_price, item_discount, item_lowest) = value
        # Build a dict and append it to the global items list
        item = {'item_time': item_time,
                'item_store': item_store,
                'item_name': item_name,
                'item_img': item_img,
                'item_url': item_url,
                'item_original_price': item_original_price,
                'item_cut': item_cut,
                'item_now_price': item_now_price,
                'item_discount': item_discount,
                'item_lowest': item_lowest,
                }
        items.append(item)
    print(item_stores)
    print(len(item_stores))
    print(items)
    # Sanity check: all ten field lists should have the same length
    L = [item_times, item_stores, item_names, item_imgs, item_urls, item_original_prices,
         item_cuts, item_now_prices, item_discounts, item_lowests]
    for i in L:
        print(str(len(i)) + ',', end='')
    return items
# Write the results to an Excel sheet (pandas needs openpyxl or xlsxwriter for .xlsx output)
def write_to_excel(items):
    df = pd.DataFrame(items)
    df.to_excel('items.xlsx', index=False)
def main(n):
    for i in range(n):
        # url_all = 'http://tool2.manmanbuy.com/PriceLessSort.aspx?PageID={0}&q=1,6,13__0_1_0_8_6_2__0_0__s0'.format(i)
        # Filter for JD only; the q parameter was copied from the request the page sends
        url_jd = 'http://tool2.manmanbuy.com/PriceLessSort.aspx?PageID={0}&q=1_0_0_1_0_8_6_2__0_0__s0_0'.format(i)
        print(url_jd)
        parse_page(url_jd)
    write_to_excel(items)
if __name__ == '__main__':
    # Pass the number of pages to crawl and run the main routine
    main(100)
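A fragility worth flagging in parse_page: the ten findall calls build parallel lists, so if one field is missing from a single item, every later column shifts out of alignment (hence the length sanity check at the end). A safer variation, shown as a minimal sketch of my own rather than the code used in this thread, grabs each <li class="proitem"> block first (using the block regex noted in parse_page) and then extracts fields inside each block:

import re

def parse_items(text):
    # Sketch: per-block extraction, so one missing field can't misalign rows
    items = []
    for block in re.findall(r'<li class="proitem">(.*?)</li>', text, re.S):
        name = re.search(r'title=.*?>(.*?)</a>', block, re.S)
        store = re.search(r"class='proinfo'.*?alt='(.*?)'", block, re.S)
        now_price = re.search(r'class="t3">(.*?)</div>', block, re.S)
        items.append({
            # Missing fields stay visible as None instead of shifting the row
            'item_name': name.group(1) if name else None,
            'item_store': store.group(1) if store else None,
            'item_now_price': now_price.group(1) if now_price else None,
        })
    return items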
Things to keep optimizing:
1. Read the total page count from the site instead of entering it by hand.
2. When saving to MongoDB, create a separate collection for each crawl date.
I'll keep improving this as I learn new things or come up with new ideas.
import requests
import re
import pandas as pd
import time
import datetime
from threading import Thread
import pymongo
# Global list that collects the items parsed from every page
items = []


def parse_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }
    resp = requests.get(url=url, headers=headers)
    text = resp.text
    # Each item sits in a block matching r'<li class="proitem">(.*?)</li>'
    item_times = re.findall(r'<li class="proitem">.*?<span>(.*?)</span>', text, re.S)
    item_imgs = re.findall(r'<li class="proitem">.*?src="(.*?)".*?</div>', text, re.S)
    item_names = re.findall(r'<li class="proitem">.*?title=.*?>(.*?)</a>', text, re.S)
    # Store name
    item_stores = re.findall(r'class=\'proinfo\'.*?alt=\'(.*?)\'', text, re.S)
    # Bare JD link (the url= parameter inside the redirect link)
    item_urls = re.findall(r'<li class="proitem">.*?href=".*?url=(.*?)".*?</div>', text, re.S)
    # Original price, t7
    item_original_prices = re.findall(r'class="t7">(.*?)</div>', text, re.S)
    # Amount saved
    item_cuts = re.findall(r'<li class="proitem">.*?title="(.*?)"', text, re.S)
    # Current price, t3
    item_now_prices = re.findall(r'class="t3">(.*?)</div>', text, re.S)
    # Discount
    item_discounts = re.findall(r'<li class="proitem">.*?title=".*?">(.*?)</a>', text, re.S)
    # Historical lowest price, t5
    item_lowests = re.findall(r'class="t5".*?<span style=.*?>(.*?)</span>', text, re.S)
    # Zip the parallel lists into one tuple per item
    for value in zip(item_times, item_stores, item_names, item_imgs, item_urls,
                     item_original_prices, item_cuts, item_now_prices,
                     item_discounts, item_lowests):
        (item_time, item_store, item_name, item_img, item_url, item_original_price,
         item_cut, item_now_price, item_discount, item_lowest) = value
        # Build a dict and append it to the global items list; prices become floats
        item = {'item_time': item_time,
                'item_store': item_store,
                'item_name': item_name,
                'item_img': item_img,
                'item_url': item_url,
                'item_original_price': float(item_original_price),
                'item_cut': item_cut,
                'item_now_price': float(item_now_price),
                'item_discount': item_discount,
                'item_lowest': float(item_lowest),
                }
        items.append(item)
    return items
# Get the total number of result pages from the pager element
def get_page_nums():
    url_jd = 'http://tool2.manmanbuy.com/PriceLessSort.aspx?PageID={0}&q=1_0_0_1_0_8_6_2__0_0__s0_0'.format(1)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }
    resp = requests.get(url=url_jd, headers=headers)
    text = resp.text
    # findall returns a list, so take the first match before pulling out the digits
    num_tag = re.findall(r'<div id="ctl00_ContentPlaceHolder1_divPage".*?</font>/(.*?)<font', text, re.S)
    num = re.search(r'\d+', num_tag[0]).group()
    return num
# Write the results to an Excel sheet
def write_to_excel(items):
    df = pd.DataFrame(items)
    df.to_excel('items.xlsx', index=False)
# Save the results to MongoDB
def save_to_mongodb(items):
    # Today's date, used to pick the collection
    query_date = datetime.datetime.now().strftime('%Y-%m-%d')
    client = pymongo.MongoClient("127.0.0.1", port=27017)
    # Create/select the database
    db = client['jd']
    # Create/select the collection: one collection per crawl date
    collection = db['products_%s' % query_date]
    # Insert the documents
    collection.insert_many(items)
def main(n):
    time.sleep(1)
    url_jd = 'http://tool2.manmanbuy.com/PriceLessSort.aspx?PageID={0}&q=1_0_0_1_0_8_6_2__0_0__s0_0'.format(n)
    print(url_jd)
    parse_page(url_jd)
    print('Page %s: %s items so far.\n' % (n, len(items)) + '-' * 50)


if __name__ == '__main__':
    # Read the page count from the site, then crawl one page per thread
    num = int(get_page_nums())
    print('%s pages in total.' % num)
    threads = []
    for i in range(num):
        t = Thread(target=main, args=(i,))
        t.start()
        threads.append(t)
    for i in threads:
        i.join()
    print('Crawl finished')
    # print(items)
    # Storage now happens once, after all threads have finished
    write_to_excel(items)
    save_to_mongodb(items)
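A usage note on the date-based collections above: since each crawl date gets its own collection, reading one day's results back out is just a matter of building the matching collection name. A minimal sketch, assuming the same local MongoDB instance and the products_YYYY-MM-DD naming scheme used in save_to_mongodb:

import datetime
import pymongo

client = pymongo.MongoClient('127.0.0.1', port=27017)
db = client['jd']
# Collections are named products_YYYY-MM-DD, one per crawl date
query_date = datetime.datetime.now().strftime('%Y-%m-%d')
collection = db['products_%s' % query_date]
print(collection.count_documents({}), 'items crawled today')
# Show the five cheapest current prices first
for doc in collection.find().sort('item_now_price', pymongo.ASCENDING).limit(5):
    print(doc['item_name'], doc['item_now_price'])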
I've been learning MongoDB, so here's another round of updates to the code.
I moved where the data gets persisted and added MongoDB as a storage backend.
MongoDB turns out to be really handy.
import requests
import re
import pandas as pd
import time
from threading import Thread
import pymongo
# Global list that collects the items parsed from every page
items = []


def parse_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }
    resp = requests.get(url=url, headers=headers)
    text = resp.text
    # Dump the raw HTML so the regexes below can be debugged against it
    with open('manmanbuy.html', 'w', encoding='utf-8') as f:
        f.write(text)
    # Each item sits in a block matching r'<li class="proitem">(.*?)</li>'
    item_times = re.findall(r'<li class="proitem">.*?<span>(.*?)</span>', text, re.S)
    item_imgs = re.findall(r'<li class="proitem">.*?src="(.*?)".*?</div>', text, re.S)
    item_names = re.findall(r'<li class="proitem">.*?title=.*?>(.*?)</a>', text, re.S)
    # Store name
    item_stores = re.findall(r'class=\'proinfo\'.*?alt=\'(.*?)\'', text, re.S)
    # Bare JD link (the url= parameter inside the redirect link)
    item_urls = re.findall(r'<li class="proitem">.*?href=".*?url=(.*?)".*?</div>', text, re.S)
    # Original price, t7
    item_original_prices = re.findall(r'class="t7">(.*?)</div>', text, re.S)
    # Amount saved
    item_cuts = re.findall(r'<li class="proitem">.*?title="(.*?)"', text, re.S)
    # Current price, t3
    item_now_prices = re.findall(r'class="t3">(.*?)</div>', text, re.S)
    # Discount
    item_discounts = re.findall(r'<li class="proitem">.*?title=".*?">(.*?)</a>', text, re.S)
    # Historical lowest price, t5
    item_lowests = re.findall(r'class="t5".*?<span style=.*?>(.*?)</span>', text, re.S)
    # Zip the parallel lists into one tuple per item
    for value in zip(item_times, item_stores, item_names, item_imgs, item_urls,
                     item_original_prices, item_cuts, item_now_prices,
                     item_discounts, item_lowests):
        (item_time, item_store, item_name, item_img, item_url, item_original_price,
         item_cut, item_now_price, item_discount, item_lowest) = value
        # Build a dict and append it to the global items list; prices become floats
        item = {'item_time': item_time,
                'item_store': item_store,
                'item_name': item_name,
                'item_img': item_img,
                'item_url': item_url,
                'item_original_price': float(item_original_price),
                'item_cut': item_cut,
                'item_now_price': float(item_now_price),
                'item_discount': item_discount,
                'item_lowest': float(item_lowest),
                }
        items.append(item)
    return items
# Write the results to an Excel sheet
def write_to_excel(items):
    df = pd.DataFrame(items)
    df.to_excel('items.xlsx', index=False)
# Save the results to MongoDB
def save_to_mongodb(items):
    client = pymongo.MongoClient("127.0.0.1", port=27017)
    # Create/select the database
    db = client['jd']
    # Create/select the collection
    collection = db['products']
    # Insert the documents
    collection.insert_many(items)
def main(n):
    time.sleep(1)
    url_jd = 'http://tool2.manmanbuy.com/PriceLessSort.aspx?PageID={0}&q=1_0_0_1_0_8_6_2__0_0__s0_0'.format(n)
    print(url_jd)
    parse_page(url_jd)
    print('Page %s: %s items so far.\n' % (n, len(items)) + '-' * 50)
if __name__ == '__main__':
    # Crawl one page per thread (13 pages at the time of writing)
    threads = []
    for i in range(13):
        t = Thread(target=main, args=(i,))
        t.start()
        threads.append(t)
    for i in threads:
        i.join()
    print('Crawl finished')
    # print(items)
    # Storage now happens once, after all threads have finished
    write_to_excel(items)
    save_to_mongodb(items)
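One thing the plain insert_many above doesn't handle is re-running the crawler: the same product gets inserted again on every run. A sketch of my own, not part of the original code, that skips duplicates by putting a unique index on item_url:

import pymongo
from pymongo.errors import BulkWriteError

def save_to_mongodb_dedup(items):
    client = pymongo.MongoClient('127.0.0.1', port=27017)
    db = client['jd']
    collection = db['products']
    # A unique index turns repeat inserts of the same item_url into duplicate-key errors
    collection.create_index('item_url', unique=True)
    try:
        # ordered=False keeps writing the remaining documents after a duplicate
        collection.insert_many(items, ordered=False)
    except BulkWriteError:
        pass  # duplicates were skipped; everything else was inserted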
Using multithreading to speed things up.

1. Import the modules

# Threading and time modules
from threading import Thread
import time

2. Use multiple threads to speed up the crawl
def main(n):
    # Brief pause so the requests don't all fire at the same instant
    time.sleep(0.5)
    url = 'http://tool2.manmanbuy.com/PriceLessSort.aspx?PageID={0}&q=1_0_0_1_0_8_6_2__0_0__s0_0'.format(n)
    print(url)
    parse_page(url)
    print(len(items))
    # Note: this rewrites items.xlsx once per page; the revision above moves it after join()
    write_to_excel(items)
if __name__ == '__main__':
    # Spin up one thread per page to speed up the crawl
    threads = []
    for i in range(63):
        # Instantiate a thread: target is the function, args is a tuple of its arguments
        t = Thread(target=main, args=(i,))
        # Start the thread
        t.start()
        threads.append(t)
    for i in threads:
        # Wait for each thread to finish
        i.join()
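Spawning one Thread per page works here because CPython's GIL makes the list.append in parse_page safe, but 63 simultaneous threads also means 63 parallel requests against one site. An alternative sketch of my own, using only the standard library rather than the thread-per-page code above, that caps concurrency with a pool:

from concurrent.futures import ThreadPoolExecutor

# Assumes main(n) and the 63-page total from above; max_workers caps concurrent requests
with ThreadPoolExecutor(max_workers=8) as pool:
    # list() forces iteration so any exception raised inside main() surfaces here
    list(pool.map(main, range(63)))
print('Crawl finished')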
Looking forward to the finished version.

Bookmarked; I'll come back and dig into this later.

Added the MongoDB screenshot.

Step by step and well explained. Very nice, thumbs up!

Where can I learn this? I'd like to monitor prices on sites like Taobao and JD.

Reply to yequanxi13 (2020-6-25 15:22): videos on Bilibili, books, and the Runoob (菜鸟教程) tutorials all work.