A beginner learning Python web scraping, sharing my code (the failed version)
I tried it and it can fetch the content, but I couldn't think of a way to save it. If any expert sees this and can help fix it, thank you!
import requests
from lxml import etree
import time
from threading import Thread

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

# Markers used on the detail pages, mapped to the keys stored in the data dict.
FIELDS = {
    '◎译 名': 'version',
    '◎片 名': 'name',
    '◎年 代': 'time',
    '◎产 地': 'place_of_origin',
    '◎类 别': 'category',
    '◎语 言': 'language',
    '◎字 幕': 'title',
    '◎上映日期': 'release',
    '◎豆瓣评分': 'grade',
    '◎导 演': 'director',
    '◎主 演': 'protagonist',
}

def get_html(url):
    try:
        r = requests.get(url, headers=headers)
        r.encoding = 'gbk'  # the site serves GBK-encoded pages
        html = etree.HTML(r.text)
        time.sleep(2)  # pause between requests to go easy on the server
        return html
    except EnvironmentError:
        return None  # return None on failure instead of the exception object

def get_xpath(html):
    href = html.xpath('//div[@class="co_content8"]//tr//td//b//a/@href')
    for i in href:
        url_l = 'https://www.dytt8.net' + i
        html_l = get_html(url=url_l)
        if html_l is None:  # skip detail pages that failed to download
            continue
        data = {}
        data['headline'] = html_l.xpath('//div[@class="title_all"]/h1/font/text()')
        print(url_l)
        zoom = html_l.xpath('//div[@id="Zoom"]//text()')
        for info in zoom:
            for marker, key in FIELDS.items():
                if info.startswith(marker):
                    data[key] = info.replace(marker, '').strip()
        data['link'] = html_l.xpath('//div[@id="Zoom"]//a/@href')  # the key was misspelled 'lilk'
        print(data)

def main(start_url, end_url):
    for i in range(start_url, end_url):
        url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html".format(i)
        data = get_html(url=url)
        if data is not None:
            get_xpath(html=data)

if __name__ == '__main__':
    thad = []
    t1 = Thread(target=main, args=(1, 10))
    t2 = Thread(target=main, args=(10, 20))
    t3 = Thread(target=main, args=(20, 30))
    t4 = Thread(target=main, args=(30, 40))
    t5 = Thread(target=main, args=(40, 50))
    thad += [t1, t2, t3, t4, t5]  # this list was cut off in the original post
    for i in thad:
        i.start()
    for i in thad:
        i.join()
Use a thread lock to keep the file writes synchronized.
I tried saving to a txt file, and it works fine:
import requests
from lxml import etree
import time
from threading import Thread
####################################
import threading
####################################

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

##################################### use a thread lock to keep the file writes synchronized
lock = threading.Lock()
#####################################

# Same marker-to-key mapping as in the original post.
FIELDS = {
    '◎译 名': 'version',
    '◎片 名': 'name',
    '◎年 代': 'time',
    '◎产 地': 'place_of_origin',
    '◎类 别': 'category',
    '◎语 言': 'language',
    '◎字 幕': 'title',
    '◎上映日期': 'release',
    '◎豆瓣评分': 'grade',
    '◎导 演': 'director',
    '◎主 演': 'protagonist',
}

def get_html(url):
    try:
        r = requests.get(url, headers=headers)
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        time.sleep(2)
        return html
    except EnvironmentError:
        return None

def get_xpath(html):
    href = html.xpath('//div[@class="co_content8"]//tr//td//b//a/@href')
    for i in href:
        url_l = 'https://www.dytt8.net' + i
        html_l = get_html(url=url_l)
        if html_l is None:
            continue
        data = {}
        data['headline'] = html_l.xpath('//div[@class="title_all"]/h1/font/text()')
        print(url_l)
        zoom = html_l.xpath('//div[@id="Zoom"]//text()')
        ############################################################ lock this section
        with lock:  # 'with' releases the lock even if an exception is raised
            for info in zoom:
                for marker, key in FIELDS.items():
                    if info.startswith(marker):
                        data[key] = info.replace(marker, '').strip()
            data['link'] = html_l.xpath('//div[@id="Zoom"]//a/@href')
            ################################################################### save the fields you want
            with open('movice.txt', 'a+') as f:  # 'as f' was typoed as 'asf'; 'with' closes the file, so f.close() is unnecessary
                f.write('译名:' + data.get('version', '') + '\n')  # .get() returns '' for a missing field instead of raising KeyError
                f.write('片名:' + data.get('name', '') + '\n')
                f.write('年代:' + data.get('time', '') + '\n')
                f.write('产地:' + data.get('place_of_origin', '') + '\n')
                f.write('类别:' + data.get('category', '') + '\n')
                f.write('上映时间:' + data.get('release', '') + '\n')
                f.write(3 * '\n')  # blank lines between movies
            ###################################################################
        print(data)

def main(start_url, end_url):
    for i in range(start_url, end_url):
        url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html".format(i)
        data = get_html(url=url)
        if data is not None:
            get_xpath(html=data)

if __name__ == '__main__':
    thad = []
    t1 = Thread(target=main, args=(1, 10))
    t2 = Thread(target=main, args=(10, 20))
    t3 = Thread(target=main, args=(20, 30))
    t4 = Thread(target=main, args=(30, 40))
    t5 = Thread(target=main, args=(40, 50))
    thad += [t1, t2, t3, t4, t5]  # this list was also cut off in the post
    for i in thad:
        i.start()
    for i in thad:
        i.join()
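A side note on the lock scope in the version above: each thread builds its own local data dict, so the only shared resource is the output file. The lock therefore only needs to wrap the write itself, and parsing can stay outside it. A minimal sketch of that narrower scope, using a hypothetical save_movie helper and the lock defined above:

def save_movie(data, path='movice.txt'):
    # parsing happens outside; only the append to the shared file is serialized
    with lock:
        with open(path, 'a+') as f:
            for label, key in (('译名', 'version'), ('片名', 'name'), ('年代', 'time'),
                               ('产地', 'place_of_origin'), ('类别', 'category'),
                               ('上映时间', 'release')):
                f.write(label + ':' + data.get(key, '') + '\n')
            f.write(3 * '\n')  # blank lines between movies, as in the version above

This way the threads don't block each other while they parse.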
你流和我流 posted on 2019-8-25 17:14:
with open('filename', 'a+') as f:
    f.write('the data you want to save')
    ......

The data has missing values, so it can't be saved as csv.

I sort of understand it and sort of don't. I really want to learn this.

I'm here to learn :lol

with open('filename', 'a+') as f:
    f.write('the data you want to save')
    ......

Just save it to a file. You can also use the csv library or the pandas library; a quick search will show you how.
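To make the csv-library route concrete: a minimal sketch using csv.DictWriter, assuming the per-movie data dicts from the script above have been collected into a list (the field names and file name here are illustrative). restval='' writes an empty cell wherever a page lacked a field, which is exactly the missing-value problem mentioned above:

import csv

FIELDNAMES = ['version', 'name', 'time', 'place_of_origin', 'category',
              'language', 'title', 'release', 'grade', 'director', 'protagonist']

def save_rows(rows, path='movies.csv'):
    # rows is a list of the data dicts built in get_xpath()
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES, restval='', extrasaction='ignore')
        writer.writeheader()
        writer.writerows(rows)  # missing keys become empty cells, extra keys are ignored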
You already failed, so why share it?

℡小疯、 posted on 2019-8-25 17:17:
You already failed, so why share it?

It's only missing the save step; everything else works.

If you only grab the first ten pages, saving to csv with pandas works fine, but my version scrapes all of them (see the pandas sketch below).

I've been meaning to learn this too.

Impressive work, even though I don't quite follow it.

I'm still learning the syntax. Which videos did you learn from, OP?
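Along the lines of the pandas suggestion: a minimal sketch, assuming every get_xpath() call appends its data dict to one shared list (the names here are illustrative). pandas aligns the dicts by key and leaves NaN where a field is missing, so incomplete pages don't break the export even when scraping all pages:

import pandas as pd

results = []  # each get_xpath() call would do results.append(data)

def save_all(path='movies.csv'):
    # DataFrame aligns rows by key; missing fields become NaN instead of errors
    df = pd.DataFrame(results)
    df.to_csv(path, index=False, encoding='utf-8-sig')

list.append is atomic under CPython's GIL, so the threads can share the results list without extra locking; only the final to_csv call, run after join(), touches the file.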