# Forum paste header: "[Python] 纯文本查看 复制代码" (view as plain text / copy code)
import re
import time

import chardet
import requests
from fake_useragent import UserAgent
from lxml import etree
# NOTE(review): `p` is not referenced anywhere else in the visible code.
p = 0
# Number of list pages to crawl; next_page() builds URLs for pages 1..curPage.
curPage = 1
# Shared list of detail-page links; parse_url() appends every link it extracts.
link_list = []
# fake_useragent generator.
# NOTE(review): `ua` is not referenced in the visible code -- the request
# headers below carry a fixed user-agent string instead.
ua = UserAgent()
# HTTP request headers sent by gethtml() and gethtml_detail() via ``headers=``.
# NOTE(review): these look like values captured from a Chrome 89 browser
# session -- confirm they are still accepted by the site.
header = {
    'authority': 'sf.taobao.com',
    'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    # Fixed user-agent string (the module-level `ua` object is not used here).
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-dest': 'document',
    # NOTE(review): the referer is a "punish" challenge URL carrying an
    # x5secdata token -- presumably a one-off value; confirm it is still needed.
    'referer': 'https://sf.taobao.com//item_list.htm/_____tmd_____/punish?x5secdata=5e0c8e1365474455070961b803bd560607b52cabf5960afff39b64ce58073f78849a367443dd53565277002f507099b27b385fff7bc17a03cafdfd3769a9e81855b38bd923f8e97fc47c28363b278d8b65b0309398014db5684e8a0ec481572e461ee819ca12264cfd380e1ff9a31817142dfbe5985ded331617d6d6e410d9338b850c4c8f5bc88e9b210d3c07b0f0f94ab750123a60d4e03982ed34693d238ada92297ed56a297b527fabd4fb0db2aa5fd102e32437f7b8741ec5aa54e4ecf042adeb9e9e60bd20ec8b0196dcfea1b2f9b86c27692bf843654469bc1ede3d7d31456b32ac99bd8d798d7e75290032c81a626a42548bb6ebdecd804bf573367db546d5a02a05c67cd3736b8b85f1bc2f325c1ed71d8363d6a82d206f2b6e9ac8874359d89eb3048a2f5487e9af5b14a6779425216103151458d986536e9c2e21262bc194ca0504960bb2bcab9457150202ea47f0bf96b7b64b20bd4a6354a4f48016f1fad0c223a14ee56e7f7b5ed077bea6176f46259447ea395abb6e8c63396bdbac16e570058ecae709eb92b09fd6e78901cfbb5dd479e139ef258a369e861811091827832e42350e7fbe9070541493e057bb641274b7602a6b1d36bce75440cb601c92195df53ae3b1dd87d07280a3cfd300ba9f4ca7159f839b68a9c8342bf329aed0749c6628d188157b3691639e4ad85ab8c178561b8c28fc0cb8b1ac304a28e0740b5aa3a5ad9de693d9b491228b19c0162b27b3f46acf84e37a4ebccd45c1c67a54c15f480b5583eb10c67e&x5step=100',
    'accept-language': 'zh-CN,zh;q=0.9',
    # NOTE(review): hard-coded session cookie (contains account identifiers
    # and tokens). It presumably expires; consider loading it from
    # configuration or the environment instead of keeping it in source.
    'cookie': 'bx-extrainfo=%22%7B%22uuid%22%3A%2212105134e93eae512c2f541f65531bfd%22%7D%22; enc=y6%2FTVWd9LtWbLfCL76yEL0EuqrZPmyrxJHpbFsHcS3K%2FD8n34djfqYIBzPrJTLDycRgP8x951sqr%2BF5XYk9YtQ%3D%3D; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; t=9a3765df6d10e8078bc1e7712e1aa62f; _uab_collina=161979669118399095767531; _m_h5_tk=1ba63be2acdb6e589ab3b1d869f285e2_1620230164756; _m_h5_tk_enc=7afd33f3a35a1620dbd456d498d58b44; xlly_s=1; _samesite_flag_=true; cookie2=2851b8ef767bb9862438e6fe8a2a96b2; _tb_token_=e33de83eb15f3; mt=ci=0_0; cna=Q0XiGFiPhXwCAX1U3diPin0k; sgcookie=E1003VXrZ99lRUdWgsGSFW7QbdgrlFkDcNHLELTVIqmgGGNqqlzY0AzPa5J5cPPVOK8W30GkXNlzm8d5JGjIjrxnlg%3D%3D; unb=2211634474879; uc1=pas=0&existShop=false&cookie16=U%2BGCWk%2F74Mx5tgzv3dWpnhjPaQ%3D%3D&cookie21=VT5L2FSpdiBh&cookie14=Uoe2zXnHVRs%2Fdg%3D%3D&cookie15=UtASsssmOIJ0bQ%3D%3D; uc3=nk2=F5RBx%2BPs3EeiLeij&lg2=VT5L2FSpMGV7TQ%3D%3D&vt3=F8dCuwlOSihjB8oGk2I%3D&id2=UUpgR1TPcn8%2BbWCyNw%3D%3D; csg=e84e539f; lgc=tb4237281554; cookie17=UUpgR1TPcn8%2BbWCyNw%3D%3D; dnk=tb4237281554; skt=0efb6dba99761212; existShop=MTYyMDM1NzY1NA%3D%3D; uc4=nk4=0%40FY4KoqOjVSrWDYX%2Br0Rx%2Bw%2B0NFk5Uy4%3D&id4=0%40U2gqyOkBSuhguKHu9HohvbdPXiN42Bw6; tracknick=tb4237281554; _cc_=Vq8l%2BKCLiw%3D%3D; _l_g_=Ug%3D%3D; sg=498; _nk_=tb4237281554; cookie1=VTk5FPsnMDkCUZfLauJquHjDiKNrWEguPWyZtbQuL7U%3D; tfstk=cvYdB0vXfADnRZC9gHniP1rYtxJdachdQ214yX_LQ8zbBzauzsVDoEKDkJ6YpNhO.; isg=BOzsPDPwoCvwIbQRYjZQ5Ii1vcoepZBPRSWuk0YqKRc6UZxbQLEY3iSgcRlpWcin; l=eBa7OwjejpK8s-POBO5alurza77TEBdXcsPzaNbMiInca1yF_F_uXNCCe0WW8dtjgtfbzKxPbQoCnRhkPmUU-AkDBeYCPlUOrxv9-',
}
# Query-string filters passed to requests via ``params=``.
#
# requests percent-encodes parameter values itself, so a value that is already
# percent-encoded ('%D6%D8%C7%EC') would go out double-encoded as
# '%25D6%25D8%25C7%25EC'. The province is therefore given as the raw bytes,
# which requests encodes exactly once to '%D6%D8%C7%EC'.
# NOTE(review): the bytes are presumably the GBK encoding of the province
# name (Chongqing) -- confirm against the site.
params = (
    ('spm', 'a213w.7398504.filter.2.3eb33a49s7ELKd'),
    ('category', '50025969'),
    ('auction_source', '0'),
    ('province', b'\xd6\xd8\xc7\xec'),
    ('sorder', '1'),
    ('st_param', '-1'),
    ('auction_start_seg', '-1'),
)
# retry_count = 5
# proxy = get_proxy().get("proxy")
# while retry_count > 0:
#     try:
#         html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)})
#         # Access the site through the proxy
#         return html
#     except Exception:
#         retry_count -= 1
# # Remove the proxy from the proxy pool
# delete_proxy(proxy)
# Fetch page content
def gethtml(url, timeout=10):
    """Download *url* and return its body decoded to text.

    The request is sent with the module-level ``header`` and ``params``.
    The body encoding is detected with chardet.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests raises a
            timeout error; without it a stalled connection blocks forever.

    Returns:
        The response body as ``str``.
    """
    response = requests.get(url, headers=header, params=params, timeout=timeout)
    # chardet reports ``None`` when it cannot guess (e.g. an empty body);
    # fall back to the declared response encoding, then to UTF-8.
    detected = chardet.detect(response.content)['encoding']
    encoding = detected or response.encoding or 'utf-8'
    # A mis-detected encoding should not abort the crawl over a stray byte.
    return response.content.decode(encoding, errors='replace')
def gethtml_detail(url, timeout=10):
    """Download a detail page through a pool proxy, retrying on failure.

    One proxy is taken from the proxy pool and the request is attempted up to
    four times through it. If every attempt fails, the proxy is removed from
    the pool and ``None`` is returned.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait per attempt, so a dead proxy cannot hang the
            crawl.

    Returns:
        The decoded page text, or ``None`` when all attempts failed.
    """
    retry_count = 4
    proxy = get_proxy().get("proxy")

    # requests selects the proxy by URL scheme. The detail URLs built by
    # parse_url() are https, so an "http"-only mapping would bypass the proxy.
    # When the pool has no proxy to offer, fall back to a direct request.
    proxies = None
    if proxy:
        proxy_url = "http://{}".format(proxy)
        proxies = {"http": proxy_url, "https": proxy_url}

    while retry_count > 0:
        try:
            # NOTE(review): the list-page filter ``params`` are also sent with
            # the detail request, as in the original -- confirm this is wanted.
            response = requests.get(
                url,
                headers=header,
                params=params,
                proxies=proxies,
                timeout=timeout,
            )
        except requests.RequestException:
            retry_count -= 1
            continue
        # chardet reports ``None`` when it cannot guess; fall back to the
        # declared response encoding, then to UTF-8.
        detected = chardet.detect(response.content)['encoding']
        encoding = detected or response.encoding or 'utf-8'
        return response.content.decode(encoding, errors='replace')

    # Every attempt failed: drop the proxy from the pool.
    if proxy:
        delete_proxy(proxy)
    return None
def parse_url(html):
    """Extract the detail-page links contained in one list page.

    Every ``sf-item...`` address that is followed by a ``?`` is captured,
    stripped of double quotes and prefixed with ``https://``.

    The links are also appended to the module-level ``link_list``. Only the
    links found in *html* are returned: returning the shared list itself made
    every later page yield the links of all earlier pages again.

    Args:
        html: Text of a list page.

    Returns:
        A new list with the detail-page URLs found in *html*, in page order.
    """
    found = [
        "https://" + match.replace('"', "")  # detail-page link
        for match in re.findall(r'(sf-item[\S]+)\?', html)
    ]
    link_list.extend(found)
    return found
def parse_url_detail(r):
    """Return the notice link embedded in a detail page.

    The link is read from the ``data-from`` attribute of the element whose id
    is ``J_NoticeDetail`` and prefixed with ``https:``.

    Args:
        r: Text of the detail page; may be ``None`` or empty when the
            download failed.

    Returns:
        The notice URL, or a single space ``' '`` when the page is missing or
        the link cannot be extracted.
    """
    # A failed download yields None; parsing it would raise before any
    # extraction is attempted, so report it the same way as a bad page.
    if not r:
        print("empty page content", "代码异常,链接返回空值")
        return ' '
    try:
        html = etree.HTML(r)
        final_link = "https:" + html.xpath('//*[@id="J_NoticeDetail"]/@data-from')[0].strip()
    except Exception as e:
        # Deliberately best-effort: one unparsable page must not stop the crawl.
        print(str(e), "代码异常,链接返回空值")
        final_link = ' '
    return final_link
# Pagination
def next_page(pages=None):
    """Build the list-page URLs for pages 1 through *pages*.

    Args:
        pages: Number of pages to generate. Defaults to the module-level
            ``curPage`` when omitted.

    Returns:
        A list of list-page URLs, one per page number, in ascending order.
        Empty when *pages* is less than 1.
    """
    if pages is None:
        pages = curPage
    url_np = 'https://sf.taobao.com/item_list.htm?&category=50025969&province=%D6%D8%C7%EC&sorder=1&page={}'
    return [url_np.format(page_no) for page_no in range(1, pages + 1)]
# Get a random proxy
def get_proxy(timeout=5):
    """Ask the local proxy-pool service for a proxy.

    Args:
        timeout: Seconds to wait for the service, so an unresponsive pool
            cannot block the crawl indefinitely.

    Returns:
        The decoded JSON reply of ``http://127.0.0.1:5010/get/``; callers read
        its ``"proxy"`` entry.
    """
    return requests.get("http://127.0.0.1:5010/get/", timeout=timeout).json()
def delete_proxy(proxy, timeout=5):
    """Tell the local proxy-pool service to drop *proxy*.

    Args:
        proxy: Proxy identifier as handed out by the pool.
        timeout: Seconds to wait for the service, so an unresponsive pool
            cannot block the crawl indefinitely.
    """
    requests.get(
        "http://127.0.0.1:5010/delete/?proxy={}".format(proxy),
        timeout=timeout,
    )
# Main routine
def run_AL():
    """Crawl every list page and print the notice link of its detail pages.

    For each list page, at most 40 detail links that have not been handled
    yet are downloaded and parsed. There is a 4-second pause after each one.
    """
    max_per_page = 40
    seen = set()  # detail URLs already handled on this or an earlier page

    for page_url in next_page():
        links = parse_url(gethtml(page_url))
        # parse_url() may hand back links already seen on earlier pages or
        # repeated within a page; keep first occurrences of new ones only.
        fresh = [link for link in dict.fromkeys(links) if link not in seen]
        for detail_url in fresh[:max_per_page]:
            html_detail = gethtml_detail(detail_url)
            if html_detail is None:
                # Download failed after all retries; report a blank link
                # instead of passing None to the parser.
                print(detail_url, "detail page download failed")
                parse = ' '
            else:
                parse = parse_url_detail(html_detail)
            time.sleep(4)
            print(parse)
        seen.update(links)
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    run_AL()