The multithreaded crawler I wrote hangs as soon as I run it. Where is the problem?
url="https://www.tianyabook.com/list/7/"headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}
url2="https://www.tianyabook.com"
q=Queue(maxsize=30)
#-----------------------------------------------------
def mkdir(path:str,path2:str):
isexists=os.path.exists(path+"/"+path2)
if not isexists:
os.makedirs(path+"/"+path2)
print("{}/{}创建成功".format(path,path2))
else:
print("{}/{}已经存在".format(path,path2))
#=====================================================
dict={}
def scz(ip,sname):
#lock.acquire()
r =requests.get(ip,headers=headers)
r.encoding="gbk"
soup=BeautifulSoup(r.text,"lxml")
s=soup.find("div",class_="panel panel-default",id="list-chapterAll")
start=time.time()
for i in s.find_all("dd"):
for k in i.find_all("a"):
if k.get("href") is not None:
#print(k.getText(),k.get("href"))
#down(k.get("href"),k.getText(),sname)
q.put((k.get("href"),k.getText(),sname))
#end=time.time()
#lock.release()
#print("{}所有章节下载完毕,共用时{}".format(sname,(end-start)))
#filename="{}\{}.txt".format("test",sname)
#with open(filename) as f:
#f.write((end-start))
def xfz(shuming:str):
#db=pymysql.connect("10.197.201.221","root","Phah.123","python")
#cs=db.cursor()
ip=q.get()
print("{}这是目录章节的地址".format(ip))
#name=q.get()
#print("{}这是章节名字".format(name))
r=requests.get(url2+ip,headers=headers)
r.encoding="gbk"
soup=BeautifulSoup(r.text,"lxml")
s=soup.find("div",class_="panel-body",id="htmlContent")
s2=soup.find("h1",class_="readTitle")
zj=s2.getText()
#print(name)
#print(s.getText())
text=str(s.getText())
print(text)
filename="{}\{}\{}.txt".format("test",shuming,zj)
with open(filename,'a+',encoding='utf-8') as f:
f.write(text)
print("{}写入成功".format(zj))
if __name__ =="__main__" :
thread=[]
for i in range(1,2):
r =requests.get(url+str(i)+".html",headers=headers)
r.encoding="gbk"
soup=BeautifulSoup(r.text,'lxml')
s=soup.find("div",class_="col-md-8")
time.sleep(5)
print("这是第{}页".format(str(i)))
start=time.time()
for j in s.find_all("a"):
if j.get("title") is not None:
print(j.get("title")+"\n"+j.get("href"))
sm=os.path.exists(str(j.get("title")))
dict["shuming"]=str(j.get("title"))
dict["ip"]=str(j.get("href"))
mkdir("test",dict["shuming"])
print(dict["shuming"],str(j.get("href")))
t1=threading.Thread(target=scz,args=(dict["ip"],dict["shuming"]))
t1.start()
t2=threading.Thread(target=xfz,args=(dict["shuming"],))
t3=threading.Thread(target=xfz,args=(dict["shuming"],))
t4=threading.Thread(target=xfz,args=(dict["shuming"],))
t2.start()
t3.start()
t4.start()
t1.join()
t2.join()
t3.join()
t4.join()
end=time.time()
print("耗时{}".format((end-start)))
I used a queue and I can't see anything wrong with it, but it hangs as soon as I run it. I don't know where the problem is.
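For reference, the hand-off I was going for is the standard producer/consumer loop (a simplified sketch, not the code above; the chapter URLs here are made up):

import threading
from queue import Queue

q = Queue(maxsize=30)

def consumer():
    # Loop until a sentinel arrives, so one worker can handle many items;
    # a worker that calls q.get() just once exits after a single item.
    while True:
        item = q.get()
        if item is None:              # sentinel: no more work
            break
        href, title, book = item      # producer puts (href, title, book) tuples
        print("would download", book, title, href)

workers = [threading.Thread(target=consumer) for _ in range(3)]
for w in workers:
    w.start()
for n in range(5):                    # producer side: queue up some work
    q.put(("/chapter/{}.html".format(n), "Chapter {}".format(n), "SomeBook"))
for _ in workers:
    q.put(None)                       # one sentinel per worker
for w in workers:
    w.join()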
It looked a bit messy; this should read better.

Still waiting for an answer!

Thanks for the reply above. I ran it: it downloaded three chapters and then got stuck.

hahawangzi posted on 2021-1-21 10:51:
Thanks for the reply above. I ran it: it downloaded three chapters and then got stuck.
Post the import section as well. If it's not your code, it's probably a network timeout; pass a timeout, e.g. requests.get(url, headers=headers, timeout=30).
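For example (a minimal sketch; the fetch helper and retry count are mine, not from the thread):

import requests

def fetch(url, headers, retries=3, timeout=30):
    # Without a timeout, one stalled connection can block a worker thread forever;
    # retry a few times before giving up.
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.encoding = "gbk"
            return r
        except requests.RequestException as e:
            print("attempt {} failed: {}".format(attempt + 1, e))
    return None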
It could also be that you started too many threads, the site detected it, and it dropped your connection.

hahawangzi posted on 2021-1-21 10:51:
Thanks for the reply above. I ran it: it downloaded three chapters and then got stuck.

I only reformatted the OP's code, nothing else.

You need to use proxy IPs. If the same IP sends requests too fast, the site will ban you. I replayed 200 requests with FD and got banned.
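Something along these lines; requests takes a proxies dict, and the address below is only a placeholder:

import requests

headers = {'User-Agent': 'Mozilla/5.0'}
proxies = {
    # placeholder proxy address; substitute a working one
    "http": "http://127.0.0.1:8888",
    "https": "http://127.0.0.1:8888",
}
r = requests.get("https://www.tianyabook.com", headers=headers,
                 proxies=proxies, timeout=30)
print(r.status_code)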
Technical thread; waiting for an expert.

Still waiting for an answer!