School is starting soon, and I want to build a visualization of JD.com product data as a class assignment.
I need to scrape some data. The crawler itself was easy and took about two hours to write, but JD's anti-scraping has me stuck. I did add a proxy pool, yet I can still only fetch about 10 pages before JD starts sending back its homepage or a login page instead of the results. The proxy pool definitely works (the proxies are paid, so they are usable).
The crawler uses coroutines for fetching and a thread pool for parsing. I don't really understand how JD's anti-scraping actually works. Is there some identifier that tells them all my requests come from the same person, so they stop returning data to me? Any ideas would be appreciated.
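While writing this post up, one thing I started to wonder about: even though the proxy and User-Agent change on every request, the headers in my crawler always carry the same hard-coded, logged-in cookie (you can see it in the code below), so maybe that is the identifier? Here is a rough probe I am planning to run, just a sketch: it fetches a single search page through a proxy without the fixed cookie and prints whether aiohttp ended up being redirected (e.g. to a login page). The test URL and the 'passport' substring check are only my guesses, not confirmed behaviour.

import asyncio
import random

import aiohttp

from proxy import proxy_pool          # my own module, same as in the main script
from User_Agents import user_agents   # my own module, same as in the main script

# hypothetical test url: first search-result page for "笔记本"
TEST_URL = 'https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC&page=1'


async def probe():
    proxies = proxy_pool()
    uas = user_agents()
    headers = {
        'User-Agent': random.choice(uas),
        'referer': 'https://www.jd.com/',
        # deliberately NO 'cookie' here: testing whether the fixed logged-in
        # cookie is what gets me flagged
    }
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
        async with session.get(TEST_URL, headers=headers,
                               proxy=random.choice(proxies)) as resp:
            body = await resp.text()
            # resp.history is non-empty if the request was redirected;
            # resp.url is the final url after redirects
            print('status:', resp.status)
            print('redirected:', bool(resp.history), '->', str(resp.url))
            print('looks like a login page:', 'passport' in str(resp.url))  # my guess at the login host
            print('body length:', len(body))


if __name__ == '__main__':
    asyncio.run(probe())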
The main code is below. I honestly don't think there is anything wrong with the code itself!!! This has me completely stumped.
import asyncio
import time
import aiohttp
from lxml import etree
from concurrent.futures import ThreadPoolExecutor, as_completed
import pymysql
import openpyxl
from urllib.parse import quote
from proxy import proxy_pool
import random
from User_Agents import user_agents
from queue import Queue
# Crawler body
async def craw(session, url, brand):
    headers = {
        'User-Agent': random.choice(user_agents),
'cookie': '__jdu=543465755; areaId=5; ipLoc-djd=5-199-217-0; PCSYCityID=CN_130000_130600_130633; shshshfpa=13ca8de1-04ac-3bae-7222-e30fa930c1a1-1630242811; shshshfpb=cx9Nkx2hRpFdHv0XwI5j1TQ==; qrsc=3; mt_xid=V2_52007VwcXUFhfVlwbTSlYVTAAEQFdWU5STRofQAAyURpOVQ1SCQMdEV5QZ1RFUwlRUggvShhfBXsCG05fWUNZHkIaXw5kBiJSbVhiXxhLH1sBYAoVVW1cVlkd; user-key=bc652380-4a33-441f-83ea-17b56c5e8f52; unpl=V2_ZzNtbRYFRxEhXEVXKUxYAWIEE1pKB0AScglGVnJJDwZmBBpaclRCFnUUR11nG1wUZgsZXUVcRxFFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8cXgBgBxRZRmdzEkU4dlN7HF0NVwIiXHIVF0l2DE5VeBwRAWIBF1pGUUcRRQl2Vw==; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_ec55ee22cd5546069a2660138ab20696|1630372615041; TrackID=1Lo3Cg2_osEI2dxgsHA6rJrxT2Rtl7zDKvek10pvURVqTHlgms-7Dcf3vPEUk1WmpXhiaO9ejVE2g_fFA9rWa8Yo66Fv1sQS8v_WuTE1QxZM; pinId=NfK4XipiezV-79WOM01o_LV9-x-f3wj7; pin=jd_679a0481a6207; unick=Wxylkxy1; _tp=o7HPTpmAt7ABNFdzVxf8iWKS+WMMihBEDDz41ijDj34=; _pst=jd_679a0481a6207; shshshfp=5a927f7a07e443b22409e52397fe48ee; __jdc=122270672; __jda=122270672.543465755.1630242809.1630462427.1630480472.12; __jdb=122270672.1.543465755|12.1630480472; shshshsID=3d8ccee13f04c744608c65e625623d50_1_1630480471968; rkv=1.0; 3AB9D23F7A4B3C9B=7JFRI4DKH7ICDO4PRGNTOLGN4LKCFHOOS7TPM5A24LYWHXGRULAU4DMH4GEKDAVTWO52V5TY7Y3QVAZLQYMXSUSUNI',
        'referer': url
    }
    try:
        # set the proxy for this request
        async with session.get(url=url, headers=headers, proxy=random.choice(proxy_list)) as response:
        # async with session.get(url=url, headers=headers) as response:
            res = await response.text()  # the returned response as a string
            # use len(res) to judge whether this is the page source we want
            if len(res) < 300:
                raise Exception  # too short: not the page we want, jump to except
            else:
                return (brand, res)  # otherwise it is the data we want, return it
    # when blocked, append the url to the fail_url list and return None
    except:
        fail_url.append(brand + '-' + url)
        return None
async def asy_main(urls, html_queue):
    sem = asyncio.Semaphore(10)  # set up a semaphore
    print('asy craw ...')  # mark the start of the crawl
    async with sem:
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
            craw_tasks = [asyncio.create_task(craw(session, i.split('-', 1)[1], i.split('-', 1)[0])) for i in urls]
            done, pending = await asyncio.wait(craw_tasks)
            print('asy craw COMPLETE ...')
            html_queue.put(done)
def parse(tup):
    list = []
    html = tup[1]
    brand = tup[0]
    tree = etree.HTML(html)
    names = tree.xpath('/html/body//li[@class="gl-item"]')
    for i in names:
        dict = {}
        if i.xpath('./@data-spu')[0]:
            name = ''.join(i.xpath('./div//div[contains(@class,"p-name p-name-type-2")]/a/em/text()'))
            dict['name'] = name
            list.append(dict)
        else:
            pass
    print(len(list))
    return (brand, list)
def multi_main(html_queue):
    print('multi parse ...')
    # connect to MySQL
    conn = pymysql.connect(host='192.168.204.128',
                           port=3306,
                           user='xxxxx',
                           password='xxxxxx',
                           db='text',
                           charset='utf8')
    # start the thread pool and parse
    with ThreadPoolExecutor() as pool:
        futures = []
        for i in range(1, html_queue.qsize() + 1):
            done = html_queue.get()
            for html in done:
                if html.result():
                    future = pool.submit(parse, (html.result()[0], html.result()[1]))
                    futures.append(future)
                else:
                    pass
        for future in as_completed(futures):  # iterate the futures with as_completed and run the SQL
            tup = future.result()
            brand = tup[0]
            list = tup[1]
            for dict in list:
                cursor = conn.cursor()
                sql = 'insert into jd(brand,name) values(%s,%s)'
                cursor.execute(sql, (brand, dict['name']))
                conn.commit()
                cursor.close()
    print('multi parse COMPLETE ...')
    # close the connection
    conn.close()
# return urls in the format brand-url (str)
def geturls():
    print('GET URL ...')
    urls = []
    wb = openpyxl.load_workbook('url_info.xlsx')
    ws = wb.active
    scope = ws.iter_rows(min_row=2, max_row=2, min_col=1, max_col=3)
    for rows in scope:
        for page in range(1, 2 * int(rows[1].value) + 1, 2):
            a = rows[0].value + '-' + str(rows[2].value + f'&cid2=671&page={page}')
            urls.append(a)
        for page in range(2, 2 * int(rows[1].value) + 1, 2):
            b = rows[0].value + '-' + f'https://search.jd.com/s_new.php?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7' \
                                      f'%94%B5%E8%84%91&ev=exbrand_{quote(rows[0].value)}%5E&cid3=672&cid2=671&page={page}&s=1&scrolling=y'
            urls.append(b)
    print('COMPLETE ...')
    return urls
if __name__ == '__main__':
    fail_url = []  # global list that collects failed urls
    urls = geturls()  # get the urls
    print('Total', len(urls), 'pages of data')  # print how many urls there are
    user_agents = user_agents()  # random User-Agent list
    proxy_list = proxy_pool()  # build the proxy pool
    html_queue = Queue()  # queue used as a buffer between crawling and parsing
    # start the coroutine-based crawl
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asy_main(urls, html_queue))
    while True:
        if len(fail_url):
            fail_url_ = list(set(fail_url))  # de-duplicate
            fail_url.clear()  # clear it, otherwise urls that later succeed would be retried forever
            time.sleep(3)
            print(f'unfinished urls --- {len(fail_url_)}')
            loop = asyncio.get_event_loop()
            loop.run_until_complete(asy_main(fail_url_, html_queue))
        else:
            print('All crawling finished!!!')
            break
    # parse the html with a thread pool and write to MySQL
    multi_main(html_queue)
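One more thing I noticed while pasting the code: in asy_main the Semaphore(10) is acquired once around the whole block, so it never actually limits anything and every page is requested at the same instant, which by itself probably looks robotic. Below is a variant I plan to test (sketch only, reusing the same proxy_list / user_agents / fail_url globals as above): the semaphore is acquired inside each request and a small random delay is added.

async def craw_throttled(session, url, brand, sem):
    headers = {
        'User-Agent': random.choice(user_agents),
        'referer': url,
    }
    try:
        async with sem:  # at most N requests in flight at the same time
            await asyncio.sleep(random.uniform(0.5, 2.0))  # small random pause
            async with session.get(url=url, headers=headers,
                                   proxy=random.choice(proxy_list)) as response:
                res = await response.text()
                if len(res) < 300:
                    raise Exception
                return (brand, res)
    except Exception:
        fail_url.append(brand + '-' + url)
        return None


async def asy_main_throttled(urls, html_queue):
    sem = asyncio.Semaphore(5)  # the limit now applies to each request
    print('asy craw ...')
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
        tasks = [asyncio.create_task(
                     craw_throttled(session, i.split('-', 1)[1], i.split('-', 1)[0], sem))
                 for i in urls]
        done, pending = await asyncio.wait(tasks)
        print('asy craw COMPLETE ...')
        html_queue.put(done)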
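Not related to the blocking problem, but while posting this I also realized that multi_main opens a cursor and commits once per row. I will probably switch to one batched insert per brand, roughly like the sketch below (it reuses the same conn and the (brand, list) tuples returned by parse):

def save_batch(conn, brand, items):
    # items is the list of dicts returned by parse(); write them in one batch
    rows = [(brand, item['name']) for item in items]
    if not rows:
        return
    with conn.cursor() as cursor:
        sql = 'insert into jd(brand,name) values(%s,%s)'
        cursor.executemany(sql, rows)
    conn.commit()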