Python 爬取开源大赛的附件
本帖最后由 E飞翔 于 2018-12-1 16:58 编辑。
精易开源大赛的源码里还是有很多比较好的东西的。
用Python写一下:
import os
import pprint
import re

from requests import post, get
class JYKY:
    """Crawl one listing page of the bbs.125.la forum and download attachments.

    ``getArticle`` scans the listing page for thread links, ``getAnnex``
    visits one thread and resolves its attachment download links, and
    ``saveAnnex`` writes a single attachment into ``SAVE_DIR``.
    """

    BASE_URL = 'https://bbs.125.la'
    USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    # Directory attachments are written to; override on the class or an
    # instance to save somewhere else.
    SAVE_DIR = r'C:\Users\Administrator\Desktop\MP3'

    def __init__(self, num):
        """Build the listing URL for page number ``num`` (an int)."""
        self.urls = 'https://bbs.125.la/forum.php?mod=forumdisplay&fid=98&typeid=585&typeid=585&filter=typeid&t=5459534&page=%d' % num

    @staticmethod
    def _parse_articles(html):
        """Return ``(link, title)`` tuples for every thread anchor in ``html``.

        ``link`` is the href with its leading ``.`` removed, so it can be
        appended directly to ``BASE_URL``.
        """
        return re.findall(r'<a href="\.(.*?html)" class="s xst">(.*?)</a>', html)

    @staticmethod
    def _parse_aids(html):
        """Return the text following ``aid`` in each attachment popup link.

        The captured value is whatever sits between ``aid`` and the closing
        quote (e.g. ``'=42'``); it is spliced verbatim into the popup URL.
        """
        return re.findall(r'id=le3600_down:show&aid(.*?)"', html)

    @staticmethod
    def _parse_attachments(html):
        """Return ``(filename, download_url)`` pairs found in a popup page.

        File names and download URLs are extracted separately and paired by
        position; if the counts differ the surplus entries are dropped.
        """
        urls = re.findall(
            r'a href="(http://att\.125\.la/plugin\.php\?id=le3600_down.*?)"',
            html,
        )
        # The name pattern has two groups (stem incl. the dot, extension), so
        # findall yields tuples that must be joined back into one file name.
        names = [
            stem + ext
            for stem, ext in re.findall(
                r'<td class="css_dashed" colspan="3">(.*?\.)(rar|7z|e|zip)</td>',
                html,
            )
        ]
        return list(zip(names, urls))

    def getArticle(self):
        """Fetch the listing page and process every thread found on it."""
        header = {'User-Agent': self.USER_AGENT}
        res = get(self.urls, headers=header).text
        for article in self._parse_articles(res):
            self.getAnnex(article)

    def getAnnex(self, listApp):
        """Find and download the attachments of one thread.

        ``listApp`` is either a ``(link, title)`` tuple as produced by
        ``getArticle`` or a bare link string; only the link is used.
        """
        # findall with two groups yields tuples, so the link has to be
        # unpacked before it can be concatenated onto the base URL.
        link = listApp if isinstance(listApp, str) else listApp[0]
        url = self.BASE_URL + link
        header = {
            'User-Agent': self.USER_AGENT,
            'Cookie': '',  # put your own session cookie here
            'Host': 'bbs.125.la',
            'Referer': self.urls,
        }
        res = get(url, headers=header).text
        for aid in self._parse_aids(res):
            res_url = 'https://bbs.125.la/plugin.php?id=le3600_down:show&aid%s&infloat=yes&handlekey=le3600_down&inajax=1&ajaxtarget=fwin_content_le3600_down' % aid
            res_i = get(res_url, headers=header).text
            attachments = self._parse_attachments(res_i)
            for name, file_url in attachments:
                self.saveAnnex(name, file_url)
            print([file_url for _, file_url in attachments])

    def saveAnnex(self, Annex_title, Annex_url):
        """Download ``Annex_url`` and save it as ``Annex_title`` in ``SAVE_DIR``.

        ``Annex_title`` is a single file name and ``Annex_url`` a single URL.
        """
        # The name originates from remote HTML: keep only its final path
        # component so it cannot escape the target directory.
        filename = os.path.basename(Annex_title)
        os.makedirs(self.SAVE_DIR, exist_ok=True)
        Annex = get(Annex_url)
        with open(os.path.join(self.SAVE_DIR, filename), 'wb') as f:
            f.write(Annex.content)
if __name__ == '__main__':
    # Script entry point: the constructor argument is the listing page number.
    crawler = JYKY(1)
    crawler.getArticle()
真是大神啊 这个可以有 不错,感谢分享 谢谢分享,一直有学,可感觉总没有进步 长见识了感谢 感谢分享,学习了 厉害啊,正在学习爬虫 这个运行结果是什么? 代码写的真好看 楼主辛苦了! 感谢楼主分享 点赞 {:1_921:}
页:
[1]