# Taobao judicial-auction (sf.taobao.com) listing scraper — recovered from a forum paste.
import random
import re
import time
import chardet
import requests
from fake_useragent import UserAgent
from lxml import etree
p = 0  # NOTE(review): never read anywhere in this file — candidate for removal
curPage = 1  # number of listing pages to crawl (consumed by next_page)
link_list = []  # detail-page URLs; parse_url appends into this across pages
ua = UserAgent()  # NOTE(review): unused — requests are sent with the fixed UA string below
# Browser-mimicking headers for sf.taobao.com. 'cookie' must be filled in with a
# valid session cookie or the listing pages may not return usable results.
header = {
    'authority': 'sf.taobao.com',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'referer': 'https://sf.taobao.com/item_list.htm?&category=50025969&province=%D6%D8%C7%EC&sorder=1&page=1',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': '',
}
# Fixed query parameters: category 50025969, province %D6%D8%C7%EC (URL-encoded
# GBK — presumably 重庆/Chongqing, matching the referer), sort order 1, page 1.
# NOTE(review): the URLs built by next_page already carry a page= query, so
# passing these params again duplicates it — verify which value the server uses.
params = (
    ('category', '50025969'),
    ('province', '%D6%D8%C7%EC'),
    ('sorder', '1'),
    ('page', '1'),
)
# Fetch page content
def gethtml(url):
    """GET a listing page and return its HTML decoded to ``str``.

    The encoding is sniffed with chardet (the site serves GBK-family pages).
    Fixes vs. original: a request timeout so the crawl cannot hang forever,
    a UTF-8 fallback when chardet cannot detect an encoding (it returns
    ``None``, which would crash ``decode``), and ``errors='replace'`` so a
    few malformed bytes cannot abort the whole page.
    """
    # NOTE(review): `url` already contains a page= query; `params` appends a
    # second page=1 — confirm the server honours the one embedded in the URL.
    response = requests.get(url, headers=header, params=params, timeout=30)
    detected = chardet.detect(response.content)['encoding']
    return response.content.decode(detected or 'utf-8', errors='replace')
def gethtml_detail(url):
    """GET an auction detail page and return its HTML decoded to ``str``.

    Same contract as ``gethtml``. Fixes vs. original: request timeout (no
    indefinite hang), UTF-8 fallback when chardet detection returns ``None``
    (which would make ``decode`` raise ``TypeError``), and tolerant decoding
    so stray bad bytes do not abort the page.
    """
    response = requests.get(url, headers=header, params=params, timeout=30)
    detected = chardet.detect(response.content)['encoding']
    return response.content.decode(detected or 'utf-8', errors='replace')
def parse_url(html):
    """Extract auction detail-page URLs from a listing-page HTML string.

    Captures every ``sf-item...`` fragment up to its query string, strips
    stray quote characters, and prefixes the ``https://`` scheme.

    Fix vs. original: returns a fresh list per call. The original appended
    into the module-global ``link_list``, so with ``curPage > 1`` run_AL
    would re-process every earlier page's links on each later iteration.
    """
    fragments = re.findall(r'(sf-item[\S]+)\?', html)  # detail-link paths
    return ["https://" + frag.replace('"', "") for frag in fragments]
def parse_url_detail(r):
    """Pull the court-notice source link from a detail-page HTML string.

    Reads the ``data-from`` attribute of the ``#J_NoticeDetail`` element and
    prefixes the scheme. Returns a single-space string when the element is
    absent (original fallback value, preserved for callers).

    Fix vs. original: the bare ``except Exception`` swallowed every error;
    now only the two expected failures are caught — ``IndexError`` (xpath
    matched nothing) and ``AttributeError`` (``etree.HTML`` returned ``None``
    for an empty/unparseable document). Anything else propagates.
    """
    html = etree.HTML(r)
    try:
        final_link = "https:" + html.xpath('//*[@id="J_NoticeDetail"]/@data-from')[0].strip()
    except (IndexError, AttributeError) as e:  # notice element missing from page
        print(str(e), "代码异常,链接返回空值")
        final_link = ' '
    return final_link
# Pagination
def next_page(pages=None):
    """Build the listing-search URLs for pages 1..pages.

    Args:
        pages: number of pages to generate; defaults to the module-level
            ``curPage`` so existing no-argument callers are unchanged.

    Returns:
        List of sf.taobao.com item_list URLs (category 50025969, fixed
        province), one per page.
    """
    if pages is None:
        pages = curPage
    url_np = 'https://sf.taobao.com/item_list.htm?&category=50025969&province=%D6%D8%C7%EC&sorder=1&page={}'
    return [url_np.format(page) for page in range(1, pages + 1)]
# Main routine
def run_AL():
    """Crawl each configured listing page and print the notice link for up
    to the first 40 auction items found on each pass."""
    for page_url in next_page():
        listing_html = gethtml(page_url)
        detail_links = parse_url(listing_html)
        # Cap at 40 detail pages per listing page, as in the original.
        for detail_url in detail_links[:40]:
            detail_html = gethtml_detail(detail_url)
            print(parse_url_detail(detail_html))
# Script entry point: crawl the configured listing pages and print notice links.
if __name__ == '__main__':
    run_AL()