This post deals with ASP.NET '__doPostBack' pagination and obfuscated email addresses: reading the content of the later pages, and decoding the protected emails.
With this kind of pagination the URL never changes, so the later pages cannot be crawled directly. After several days of research and Baidu searches, I found no good write-up of this case. The paged content is actually fetched with a POST request: in the F12 network panel, the request carries form data with three fields, '__EVENTTARGET', '__EVENTARGUMENT' (which holds the page number), and '__VIEWSTATE'. Sending these three parameters and varying the value of '__EVENTARGUMENT' returns the content of each page; a minimal sketch of just that request follows, then the full script.
The final output is a .jl (JSON Lines) file; see the attachment for details.
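To show the postback in isolation before the full script, here is a minimal sketch, assuming the pager control name and URL used below; fetch_page is only an illustrative helper name:

[Python]
import requests
from bs4 import BeautifulSoup

URL = 'https://catalogue.ite-expo.ru/en-GB/exhibitorlist.aspx?project_id=520'
PAGER = 'p$lt$zoneContainer$pageplaceholder$p$lt$zoneForm$UniPager$pagerElem'


def fetch_page(session, page_num):
    # GET once to obtain the current __VIEWSTATE hidden field
    soup = BeautifulSoup(session.get(URL).text, 'html.parser')
    vs = soup.find('input', {'name': '__VIEWSTATE'})
    # POST the three hidden fields; __EVENTARGUMENT carries the page number
    data = {
        '__EVENTTARGET': PAGER,
        '__EVENTARGUMENT': str(page_num),
        '__VIEWSTATE': vs['value'] if vs else '',
    }
    return session.post(URL, data=data, timeout=20)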
[Python]
import requests
import re
from bs4 import BeautifulSoup
from lxml import etree
import json
import threading
import time

def extract_data(page_num):
    """Fetch one listing page via the ASP.NET postback and scrape its companies."""
    url = 'https://catalogue.ite-expo.ru/en-GB/exhibitorlist.aspx?project_id=520'
    companies_data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    }
    with requests.Session() as session:
        # GET once first to pick up the current __VIEWSTATE hidden field
        response = session.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        event_target = 'p$lt$zoneContainer$pageplaceholder$p$lt$zoneForm$UniPager$pagerElem'
        event_argument = str(page_num)  # __EVENTARGUMENT carries the page number
        viewstate_input = soup.find('input', {'name': '__VIEWSTATE'})
        viewstate = viewstate_input['value'] if viewstate_input else ''
        data = {
            '__EVENTTARGET': event_target,
            '__EVENTARGUMENT': event_argument,
            '__VIEWSTATE': viewstate,
        }
        try:
            # The POST that __doPostBack would fire in the browser
            response = session.post(url, data=data, headers=headers, timeout=20)
        except requests.Timeout:
            print("Request timed out!")
            return companies_data  # return the (empty) list so main() can still iterate it
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)
        popUp = html.xpath("/html/body/form/section/div/div[2]/a/@href")
        name = html.xpath("/html/body/form/section/div/div[2]/a/div[@class='name']/text()")
        stand = html.xpath("/html/body/form/section/div/div[2]/a/div[@class='stand']/text()")
        pattern = r'[A-Z]\d+'  # stand numbers look like "B123"
        stand_number = [re.search(pattern, sub_text).group() if re.search(pattern, sub_text) else ''
                        for sub_text in stand]
        for idx, href in enumerate(popUp, start=1):
            print(f"Scraping item {idx} of page {page_num}...")
            if not href.startswith('http'):
                baseURL = 'https://catalogue.ite-expo.ru/'
                href = baseURL + href
            try:
                response_next = requests.get(href, timeout=20)
            except requests.Timeout:
                print("Request timed out!")
                continue
            if response_next.status_code == 200:
                html_next = etree.HTML(response_next.text)
                Telephone_number = html_next.xpath("/html/body/form/div[5]/div/div[5]/div/p/text()")
                Website = html_next.xpath("/html/body/form/div[5]/div/div[6]/div/p/a/text()")
                About_company = html_next.xpath("/html/body/form/div[5]/div/div[8]/div/p/text()")
                emails = html_next.xpath("/html/body/form/div[5]/div/div[7]/div/p/a/@href")
                pattern = r'/cdn-cgi/l/email-protection#'
                if emails:
                    decoded_emails = []
                    for encoded_email in emails:
                        if not encoded_email.startswith('/cdn-cgi/l/email-protection#'):
                            continue
                        encoded_email = re.sub(pattern, '', encoded_email)
                        if not encoded_email:
                            continue
                        # Cloudflare obfuscation: the first hex byte is the XOR key,
                        # the remaining bytes are the address's characters XOR'd with it
                        r = int(encoded_email[:2], 16)
                        decoded_email = ''.join(
                            chr(int(encoded_email[i:i + 2], 16) ^ r)
                            for i in range(2, len(encoded_email), 2))
                        decoded_emails.append(decoded_email)
                    Email = ', '.join(decoded_emails)
                else:
                    Email = None
                # Keys are the Chinese field names: company name, stand number,
                # telephone, website, email, company profile
                clear_list = {
                    '公司名': name[idx - 1] if idx <= len(name) else None,
                    '展位号': stand_number[idx - 1] if idx <= len(stand_number) else None,
                    '电话': Telephone_number[0] if Telephone_number else None,
                    '官网': Website[0] if Website else None,
                    '邮箱': Email if Email else None,
                    '公司简介': About_company[0].strip() if About_company else None,
                }
                companies_data.append(clear_list)
    return companies_data

def main(num_threads, delay_seconds):
    threads = []
    results = []  # list.append is atomic under the GIL, so worker threads can share it
    for page_num in range(1, 26):  # assuming the site has 25 pages
        thread = threading.Thread(target=lambda p: results.append(extract_data(p)), args=(page_num,))
        threads.append(thread)
        time.sleep(delay_seconds)  # stagger thread start-up
        thread.start()
        # Throttle: active_count() includes the main thread, hence the -1
        while threading.active_count() - 1 >= num_threads:
            time.sleep(0.1)
    # Wait for all threads to finish
    for thread in threads:
        thread.join()
    # Save the data as a JSON Lines (.jl) file, one company per line
    with open('companies_data.jl', 'w', encoding='utf-8') as jl_file:
        for result in results:
            for company in result:
                json.dump(company, jl_file, ensure_ascii=False)
                jl_file.write('\n')
    print("Scraping finished; data saved to companies_data.jl.")

if __name__ == "__main__":
    num_threads = int(input("Number of threads: "))
    delay_seconds = float(input("Delay between threads (seconds): "))
    main(num_threads, delay_seconds)
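The Cloudflare decoding step buried in the script above can also be pulled out into a standalone helper. Below is a sketch; decode_cf_email and encode_cf_email are illustrative names, and the encoder exists only so the decoder can be round-trip tested without live data (Cloudflare does the real encoding server-side):

[Python]
def decode_cf_email(encoded: str) -> str:
    """Decode a Cloudflare email-protection hex string:
    the first byte is the XOR key, the rest are XOR'd characters."""
    key = int(encoded[:2], 16)
    return ''.join(chr(int(encoded[i:i + 2], 16) ^ key)
                   for i in range(2, len(encoded), 2))


def encode_cf_email(email: str, key: int = 0x42) -> str:
    # Demo-only encoder so the decoder can be verified locally.
    return f'{key:02x}' + ''.join(f'{ord(c) ^ key:02x}' for c in email)


assert decode_cf_email(encode_cf_email('user@example.com')) == 'user@example.com'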