本帖最后由 yx_robert 于 2018-11-16 02:08 编辑
练练手，
顺便解决点实际问题。
求助 XPath：
用得还不是很顺手，
有写得很丑陋的地方，
求大神指点。
[Python] 纯文本查看 复制代码
#! /usr/bin/env python
# -*- coding: UTF-8 -*-
import io
import sys

import requests
from lxml import etree
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# interpreter start-up, so the module is reloaded to get it back before the
# implicit str <-> unicode conversion codec is switched to GBK.
# Python 3 has neither the reload builtin nor setdefaultencoding, so the hack
# is skipped there instead of failing with NameError on import.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('gbk')
def gbk_2_utf(_str):
    """Return *_str* re-encoded as a UTF-8 byte string.

    Args:
        _str: A GBK-encoded byte string, or text that is already decoded.

    Returns:
        The same text encoded as UTF-8 bytes.

    Raises:
        UnicodeDecodeError: If a byte string argument is not valid GBK.
    """
    # Only raw bytes need the GBK decode step; text is encoded directly so the
    # helper also works where text objects have no decode() method.
    if isinstance(_str, (bytes, bytearray)):
        _str = _str.decode('gbk')
    return _str.encode('UTF-8')
# HTTP headers sent with every request: a desktop Chrome User-Agent and a
# Referer pointing at the first listing page of board 16.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
    'Referer': 'https://www.52pojie.cn/forum-16-1.html',
}

# File the search results are written to.
save_file = u'搜索结果.txt'

# Other board listing URLs kept for reference:
#   url_soft  = 'https://www.52pojie.cn/forum-16-1.html'
#   url_code  = 'https://www.52pojie.cn/forum-24-1.html'
#   url_movie = 'https://www.52pojie.cn/forum-56-1.html'
#
# Category-filtered listings of board 16 (fid=16), also for reference.
# "Windows" category (typeid=231):
#   https://www.52pojie.cn/forum.php?mod=forumdisplay&fid=16&filter=typeid&typeid=231
#   https://www.52pojie.cn/forum.php?mod=forumdisplay&fid=16&typeid=231&filter=typeid&typeid=231&page=1
#   https://www.52pojie.cn/forum.php?mod=forumdisplay&fid=16&typeid=231&typeid=231&filter=typeid&page=13
# "Auxiliary software" category (typeid=289):
#   https://www.52pojie.cn/forum.php?mod=forumdisplay&fid=16&filter=typeid&typeid=289
#   https://www.52pojie.cn/forum.php?mod=forumdisplay&fid=16&typeid=289&filter=typeid&typeid=289&page=2

# Site root; prepended to the relative links found on a listing page.
main_web = 'https://www.52pojie.cn/'

# Listing URL template; %d is replaced by the page number.
url = (
    'https://www.52pojie.cn/forum.php'
    '?mod=forumdisplay&fid=16&typeid=231&filter=typeid&typeid=231&page=%d'
)

# Highest page number to fetch (pages are numbered from 1).
max_pag = 50

# NOTE(review): not referenced by any code visible in this file.
filter_str = 'amp;'

# Keyword a thread title must contain to be saved; set it to '' to save
# every thread:
#   tar_str = ''
tar_str = u'百度'
# XPath selecting the thread-title links on a board listing page.
_THREAD_LINK_XPATH = (
    '//a[contains(@href, "forum.php?") and @onclick="atarget(this)"]'
)

# Seconds to wait for the server before giving up on a page.
_REQUEST_TIMEOUT = 10


def _iter_threads(page_html):
    """Yield a ``(title, href)`` pair for each thread link in *page_html*.

    Both values are read from the same ``<a>`` element, so a link without a
    text node (or with several) can no longer shift titles against URLs or
    cause the whole page to be discarded.
    """
    root = etree.HTML(page_html)
    if root is None:
        # The parser produced no tree; there is nothing to scan.
        return
    for anchor in root.xpath(_THREAD_LINK_XPATH):
        # string() is the concatenated text content of the link element.
        yield anchor.xpath('string()'), anchor.get('href')


def main():
    """Crawl the listing pages and save the matching threads to ``save_file``.

    Pages 1 through ``max_pag`` of ``url`` are fetched with ``headers`` and
    decoded as GBK. Every thread whose title contains ``tar_str`` is written
    as the title line, the absolute URL (``main_web`` + href) and two blank
    lines. An empty ``tar_str`` matches every thread.

    The output file is truncated on each run and written as GBK; characters
    that GBK cannot represent are replaced instead of aborting the crawl.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # An explicit file encoding replaces the reliance on the process-wide
    # default encoding for the implicit unicode -> bytes conversion.
    with io.open(save_file, 'w', encoding='gbk', errors='replace') as out:
        for page in range(1, max_pag + 1):
            resp = requests.get(url % page, headers=headers,
                                timeout=_REQUEST_TIMEOUT)
            resp.encoding = 'gbk'
            for title, href in _iter_threads(resp.text):
                # '' is a substring of every title, so an empty keyword
                # needs no separate branch.
                if tar_str in title:
                    out.write(u'%s\n%s%s\n\n\n' % (title, main_web, href))
if __name__ == "__main__":
    # Run the crawl only when executed as a script, not when imported.
    main()