python编写的一个爬取补天厂商标题对应百度查找主域名的一个收集脚本
刚注册吾爱论坛,没什么可发的就发个爬虫脚本吧。这是去年刷洞的时候写的一个脚本,使用时把 Cookie 改成你自己访问补天网站的 Cookie 就行。
然后这是代码实例运行图
最后附上去年爬的一个2000个厂商列表吧
# coding=utf-8
#author:Liod
import requests,re,json
class butian(object):
    """Fetch one page of the public vendor list from the Butian (360 SRC) site.

    One instance per result page: ``butianjson()`` POSTs the page request to
    the reward endpoint and returns the vendor (company) names found in the
    JSON response.
    """

    def __init__(self, page):
        # page: 1-based index of the vendor-list page to fetch.
        self.page = page
        self.butian_url = "http://loudong.360.cn/Reward/pub"
        #self.proxies = {"http":"113.214.13.1:8000"}
        # POST form body expected by the endpoint; "token" is left empty here.
        self.data = {
            "s": 1,
            "p": self.page,
            "token": ""
        }

    def bananer(self):
        """Build and return the HTTP headers used for the Butian request.

        NOTE(review): "Cookie" must be filled in with a valid session cookie
        before the endpoint will accept the request.
        """
        # BUG FIX: the original pinned "Content-Length" to '14', which is only
        # correct for single-digit pages ("s=1&p=1&token=" is 14 bytes) and
        # wrong from page 10 onward; requests computes the length itself.
        self.header = {
            "Cookie": "",  # COOKIE
            "Host": "loudong.360.cn",
            "Referer": "http://loudong.360.cn/Service",
            "User-Agent": "Mozilla/5.0 (Linux; U; Android 5.1; zh-cn; m1 metal Build/LMY47I) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.6 Mobile Safari/537.36",
            "Origin": "http://loudong.360.cn",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "X-Requested-With": "XMLHttpRequest",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Accept-Language": "zh-CN,zh;q=0.8"
        }
        return self.header

    def butianjson(self):
        """POST the page request and return the list of company names.

        Returns an empty list when the response carries no vendor entries.
        """
        # Use the URL stored on the instance instead of repeating the literal.
        self.res = requests.post(self.butian_url, headers=self.bananer(), data=self.data)
        print(self.res.content)
        self.content = json.loads(self.res.content)
        # BUG FIX: the original indexed the list with a string key
        # (["list"]["company_name"] -> TypeError) and its range(len-1)
        # skipped the last entry; iterate every entry instead.
        return [item["company_name"] for item in self.content["data"]["list"]]
class baidu(object):
    """Look up a company name on Baidu and resolve its main domain.

    ``connect_baidu()`` searches Baidu for the given keyword, follows the
    first result's redirect link and returns "scheme://host" of the final
    URL; ``save_txt()`` appends one result line to the output file.
    """

    def __init__(self):
        # Default search URL (overwritten by connect_baidu on each call).
        self.url = "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=%E5%B9%BF%E5%B7%9E%E8%A7%86%E6%BA%90%E8%82%A1%E4%BB%BD%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8"
        # NOTE(review): this Baidu session cookie is hard-coded and likely
        # stale — replace with a fresh one if searches start failing.
        self.bananer = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36",
            "Cookie": "BAIDUID=A8AC42B1F46CDE7379A037C75CB62819:FG=1; BIDUPSID=A8AC42B1F46CDE7379A037C75CB62819; PSTM=1509928743; BDSFRCVID=W2AsJeCCxG3wqIbA3H_73bWlRYwArbZtRVBJ3J; H_BDCLCKID_SF=tRk8oDDafCvbfP0k54r-hICShUFX5-CsQbrCQhcH0hOWsIO6KfrDLjtnBNte5qbQLH5f54otytbCSlo_DUC0-nDSHHK8Jj8O3J; BD_UPN=123353; H_PS_645EC=87d0k6j1zJCm9Ri%2Fyz1u3cOEnpeK5T6s2yB7SB5VJZU3itkGx%2FAeu%2BGEwAs; BD_CK_SAM=1; PSINO=2; BDSVRTM=159; H_PS_PSSID=1426_12896_21106_17001_24879; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598"
        }
        self.proxies = {"http": "113.214.13.1:8000"}

    def save_txt(self, url):
        """Append *url* as one line to the result file (kept across runs)."""
        # BUG FIX: the original opened the file without a context manager;
        # "with" guarantees the handle is closed even on write errors.
        with open("test111saa.txt", "a+") as handle:
            handle.write("%s\r\n" % url)

    def connect_baidu(self, url):
        """Search Baidu for the keyword *url* and return its main domain.

        Returns "scheme://host" of the first result's final (redirected)
        URL, or "" when no result is found or the follow-up request fails.
        """
        self.url = "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=%s" % url
        self.res = requests.get(self.url, headers=self.bananer, proxies=self.proxies, timeout=10)
        # .text (not .content) so the str pattern also matches on Python 3,
        # where .content is bytes.
        self.result = re.findall(r'<div class="f13"><a target="_blank" href="(.*?)"', self.res.text)
        print(self.result)
        # BUG FIX: guard against an empty result list before indexing it.
        if not self.result:
            return ""
        try:
            # BUG FIX: the original passed the whole list to requests.get;
            # follow only the first result's redirect link.
            res_url = requests.get(self.result[0], allow_redirects=True, timeout=10)
            _url = res_url.url
        except requests.RequestException:
            # Narrowed from a bare "except:" — only network/HTTP failures
            # are expected here; anything else should surface.
            return ""
        # BUG FIX: the original added two lists to a str (TypeError);
        # rebuild "scheme://host" from the final redirected URL instead.
        parts = _url.split("/")
        return parts[0] + "//" + parts[2]
if __name__ == "__main__":
    # Walk the Butian vendor pages, resolve each vendor's main domain via
    # Baidu, and append every result to the output file.
    # range() replaces the Python-2-only xrange(); for 199 pages the list
    # cost on py2 is negligible and the code now runs on py3 as well.
    for page in range(1, 200):
        crawler = butian(page)
        for company in crawler.butianjson():
            print(company)
            finder = baidu()
            main_domain = finder.connect_baidu(company)
            finder.save_txt(main_domain)
            print("FILE OK! %s" % main_domain)
占个楼 谢谢分享 谢谢分享!!! 感谢分享!
感谢分享! 好东西 多谢分享 这个补天
是什么意思。。 babyhux 发表于 2018-3-17 11:00
这个补天
是什么意思。。
补天,360一个公益src的漏洞平台,挖洞的厂商列表有时候刷洞可以利用批量的思路,当然不仅是补天,盒子等等都可以爬取厂商列表,新人才来 厉害喽,大神每一个爬取背后都有一个故事。
页:
[1]
2