# [Python] Plain-text view / copy code  (forum copy-paste header, not part of the program)
import re
import time
import random
import chardet
import requests
from lxml import etree
# Module-level counters; neither is read by the code visible in this file.
p = 0
curPage = 1

# Detail-page URLs that run_AL() walks through; parse_url() appends to this
# same list.
link_list = [
    'https://sf-item.taobao.com/sf_item/640824628883.htm',
    'https://sf-item.taobao.com/sf_item/641502301303.htm',
    'https://sf-item.taobao.com/sf_item/641843794084.htm',
    'https://sf-item.taobao.com/sf_item/642251459421.htm',
    'https://sf-item.taobao.com/sf_item/642193923780.htm',
    'https://sf-item.taobao.com/sf_item/642194171464.htm',
    'https://sf-item.taobao.com/sf_item/642254059021.htm',
    'https://sf-item.taobao.com/sf_item/642633234548.htm',
    'https://sf-item.taobao.com/sf_item/641674674943.htm',
    'https://sf-item.taobao.com/sf_item/641860786177.htm',
    'https://sf-item.taobao.com/sf_item/641833404613.htm',
    'https://sf-item.taobao.com/sf_item/642906910580.htm',
    'https://sf-item.taobao.com/sf_item/642930454182.htm',
    'https://sf-item.taobao.com/sf_item/643246399434.htm',
    'https://sf-item.taobao.com/sf_item/643107141190.htm',
    'https://sf-item.taobao.com/sf_item/641423917301.htm',
    'https://sf-item.taobao.com/sf_item/641126976926.htm',
    'https://sf-item.taobao.com/sf_item/640899648258.htm',
    'https://sf-item.taobao.com/sf_item/641285745911.htm',
    'https://sf-item.taobao.com/sf_item/641870471695.htm',
    'https://sf-item.taobao.com/sf_item/641051516853.htm',
    'https://sf-item.taobao.com/sf_item/641299249850.htm',
    'https://sf-item.taobao.com/sf_item/640900456736.htm',
    'https://sf-item.taobao.com/sf_item/641435389754.htm',
    'https://sf-item.taobao.com/sf_item/642820317425.htm',
    'https://sf-item.taobao.com/sf_item/642186087592.htm',
    'https://sf-item.taobao.com/sf_item/642151179689.htm',
    'https://sf-item.taobao.com/sf_item/640825988107.htm',
    'https://sf-item.taobao.com/sf_item/644021779506.htm',
    'https://sf-item.taobao.com/sf_item/642666508870.htm',
    'https://sf-item.taobao.com/sf_item/643049745347.htm',
    'https://sf-item.taobao.com/sf_item/642662660309.htm',
    'https://sf-item.taobao.com/sf_item/642664828068.htm',
    'https://sf-item.taobao.com/sf_item/642663848703.htm',
    'https://sf-item.taobao.com/sf_item/643740591181.htm',
    'https://sf-item.taobao.com/sf_item/643395154262.htm',
    'https://sf-item.taobao.com/sf_item/643398210037.htm',
    'https://sf-item.taobao.com/sf_item/643047449184.htm',
    'https://sf-item.taobao.com/sf_item/643738223811.htm',
    'https://sf-item.taobao.com/sf_item/643050809962.htm',
]
# HTTP request headers passed to requests.get() by gethtml_detail().
header = {
    'authority': 'sf.taobao.com',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.114 Safari/537.36'
    ),
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'zh-CN,zh;q=0.9',
    # The "enc" cookie value is empty here.
    'cookie': 'enc=',
}
# Query-string pairs passed to requests.get() by gethtml_detail().
# NOTE(review): 'province' is already percent-encoded (looks like GBK bytes);
# requests encodes params again, so '%' would go out as '%25' -- confirm the
# server receives the intended value.
params = (
    ('spm', 'a213w.7398504.filter.63.76d93a49nDn8tk'),
    ('auction_source', '0'),
    ('province', '%D6%D8%C7%EC'),
    ('sorder', '1'),
    ('st_param', '-1'),
    ('auction_start_seg', '-1'),
)
# Fetch page content
def gethtml_detail(url, timeout=10):
    """Download *url* and return the response body decoded to text.

    The request is sent with the module-level ``header`` and ``params`` and
    with the proxy mapping returned by ``get_random_ip(can_use)`` (that helper
    and ``can_use`` are defined outside this function).  The body encoding is
    detected from the raw bytes with ``chardet``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests.get``.  Without a timeout a
            stalled connection or dead proxy would block the crawl forever.

    Returns:
        The response body as ``str``.

    Raises:
        requests.RequestException: On connection failure or timeout.
        UnicodeDecodeError: If the body does not decode with the chosen
            encoding.
    """
    proxies = get_random_ip(can_use)
    response = requests.get(
        url,
        headers=header,
        params=params,
        proxies=proxies,
        timeout=timeout,
    )
    raw = response.content
    # chardet reports encoding=None when it cannot guess (e.g. an empty body);
    # bytes.decode(None) would raise TypeError, so fall back to UTF-8.
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    return raw.decode(encoding)
# Extract page data
def parse_url(html):
    """Append every detail link found in *html* to the module-level ``link_list``.

    A link is any run of non-whitespace characters that starts with
    ``sf-item`` and is followed by ``?``; the ``?`` and what follows are
    dropped, double quotes are stripped, and ``https://`` is prepended.

    Args:
        html: Page text to scan.

    Returns:
        The module-level ``link_list`` itself (mutated in place), not a copy.
    """
    # NOTE(review): ``[\S]+`` is greedy, so if one whitespace-free run holds
    # several '?' the capture extends to the last one -- confirm against
    # real pages.
    found = re.findall(r'(sf-item[\S]+)\?', html)  # detail links
    # Build the full detail-page URL for each match.
    link_list.extend("https://" + raw.replace('"', "") for raw in found)
    return link_list
def parse_url_detail(r):
    """Return the ``data-from`` link of the ``J_NoticeDetail`` element in *r*.

    Args:
        r: HTML text of a detail page.

    Returns:
        ``"https:"`` followed by the whitespace-stripped ``data-from``
        attribute of the element whose id is ``J_NoticeDetail``.

    Raises:
        IndexError: If the page has no such element/attribute.
    """
    tree = etree.HTML(r)
    sources = tree.xpath('//*[@id="J_NoticeDetail"]/@data-from')
    if not sources:
        # Same exception type a bare [0] would raise, but with a message that
        # says what was missing instead of "list index out of range".
        raise IndexError('no element with id "J_NoticeDetail" carrying a data-from attribute')
    return "https:" + sources[0].strip()
# Main routine
def run_AL():
    """Fetch every URL in ``link_list`` and print the link extracted from it.

    Each page is downloaded with ``gethtml_detail`` and handed to
    ``parse_url_detail``; the result is printed.  Any exception raised by
    either call propagates and stops the loop.
    """
    for url in link_list:
        page = gethtml_detail(url)
        print(parse_url_detail(page))
if __name__ == '__main__':
    # NOTE(review): get_ip_list, check_ip and get_random_ip are not defined in
    # the visible part of this file -- they must be supplied before this runs.
    proxies_list = get_ip_list()
    # can_use is read as a global by gethtml_detail(), so it has to be bound
    # before run_AL() starts fetching.
    can_use = check_ip(proxies_list)
    run_AL()