小白学习python爬虫，分享一下代码（失败的代码）

来两碗米饭 发表于 2019-8-25 16:24

尝试了，可以获取到内容，但是没有想到用什么办法保存
如果有大牛看到了能帮忙改好就谢谢了

import time
from threading import Thread

import requests
from lxml import etree

# HTTP request headers handed to requests.get by get_html; the User-Agent
# is a desktop Chrome browser string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/75.0.3770.100 Safari/537.36"
    ),
}

def get_html(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    The response text is decoded as GBK before parsing, and the function
    sleeps for two seconds after each successful download.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The tree produced by ``etree.HTML`` on success.  When an
        ``EnvironmentError`` is raised the caught exception object is
        returned instead of being raised, so callers must check the result
        before calling ``.xpath`` on it.
    """
    try:
        # requests waits indefinitely by default; a timeout keeps a stalled
        # connection from blocking this worker thread forever.
        r = requests.get(url, headers=headers, timeout=15)
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        time.sleep(2)
        return html
    except EnvironmentError as e:
        # requests' exceptions derive from IOError, so connection errors
        # and timeouts end up here as well.
        return e

# (label that starts a metadata line on the detail page, key used in the
# result dict).  The gaps inside the labels are full-width spaces.
_FIELD_LABELS = (
    ("◎译　　名", 'version'),
    ("◎片　　名", 'name'),
    ("◎年　　代", 'time'),
    ("◎产　　地", 'place_of_origin'),
    ("◎类　　别", 'category'),
    ('◎语　　言', 'language'),
    ('◎字　　幕', 'title'),
    ('◎上映日期', 'release'),
    ('◎豆瓣评分', 'grade'),
    ('◎导　　演', 'director'),
    ('◎主　　演', 'protagonist'),
)


def _parse_movie_fields(lines):
    """Collect the labelled metadata lines of one detail page into a dict."""
    fields = {}
    for line in lines:
        for label, key in _FIELD_LABELS:
            if line.startswith(label):
                fields[key] = line.replace(label, '').strip()
                # No label is a prefix of another, so one match is enough.
                break
    return fields


def get_xpath(html):
    """Visit every movie linked from a list page and print its metadata.

    For each link found on the list page the detail page is downloaded with
    ``get_html``, its headline, labelled metadata lines and the hrefs inside
    the ``Zoom`` div are gathered into a dict, and that dict is printed.
    Fields that are absent from a page are simply left out of the dict.

    Args:
        html: Parsed list page (an object with an ``xpath`` method).
            Anything without ``xpath`` -- e.g. the exception object that
            ``get_html`` returns on failure, or ``None`` -- is reported and
            skipped instead of raising ``AttributeError``.

    Returns:
        None.
    """
    if not hasattr(html, 'xpath'):
        print('skipping list page, fetch failed: {!r}'.format(html))
        return
    hrefs = html.xpath('//div[@class="co_content8"]//tr//td//b//a//@href')
    for href in hrefs:
        url_l = 'https://www.dytt8.net' + href
        html_l = get_html(url=url_l)
        if not hasattr(html_l, 'xpath'):
            # One broken detail page must not abort the rest of the list.
            print('skipping {}, fetch failed: {!r}'.format(url_l, html_l))
            continue
        data = {}
        data['headline'] = html_l.xpath('//div[@class="title_all"]/h1/font/text()')
        print(url_l)
        zoom = html_l.xpath('.//div[@id="Zoom"]//.//text()')
        data.update(_parse_movie_fields(zoom))
        # NOTE(review): 'lilk' looks like a typo for 'link'; the key is kept
        # unchanged so the printed output stays the same.
        data['lilk'] = html_l.xpath('//div[@id="Zoom"]//a/@href')
        print(data)

def main(start_url, end_url):
    """Crawl a contiguous range of list pages.

    Args:
        start_url: Number of the first list page to crawl (inclusive).
        end_url: Page number at which to stop (exclusive).

    Returns:
        None.
    """
    for page in range(start_url, end_url):
        url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html".format(page)
        data = get_html(url=url)
        if not hasattr(data, 'xpath'):
            # get_html returns the caught exception instead of raising it;
            # skip this page so the remaining pages are still crawled.
            print('skipping {}, fetch failed: {!r}'.format(url, data))
            continue
        get_xpath(html=data)

if __name__ == '__main__':
    # Five worker threads, each crawling its own slice of list pages
    # (the second number of each pair is exclusive).
    thad = []
    t1 = Thread(target=main, args=(1, 10))
    t2 = Thread(target=main, args=(10, 20))
    t3 = Thread(target=main, args=(20, 30))
    t4 = Thread(target=main, args=(30, 40))
    t5 = Thread(target=main, args=(40, 50))
    # The right-hand side was lost when the code was pasted; without it the
    # statement is a syntax error and no thread is ever started.
    thad += [t1, t2, t3, t4, t5]
    for i in thad:
        i.start()
    # Wait for every worker to finish before the script exits.
    for i in thad:
        i.join()

你流和我流 发表于 2019-8-25 21:51

利用线程锁,保持写入数据同步
我试着保存为txt文档,没问题啊
import requests
from lxml import etree
import time
from threading import Thread
####################################
import threading
####################################
# HTTP request headers handed to requests.get by get_html; the User-Agent
# is a desktop Chrome browser string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/75.0.3770.100 Safari/537.36"
    ),
}

# Thread lock used to keep writes to the output file synchronised.
lock = threading.Lock()

def get_html(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    The response text is decoded as GBK before parsing, and the function
    sleeps for two seconds after each successful download.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The tree produced by ``etree.HTML`` on success.  When an
        ``EnvironmentError`` is raised the caught exception object is
        returned instead of being raised, so callers must check the result
        before calling ``.xpath`` on it.
    """
    try:
        # requests waits indefinitely by default; a timeout keeps a stalled
        # connection from blocking this worker thread forever.
        r = requests.get(url, headers=headers, timeout=15)
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        time.sleep(2)
        return html
    except EnvironmentError as e:
        # requests' exceptions derive from IOError, so connection errors
        # and timeouts end up here as well.
        return e

# (label that starts a metadata line on the detail page, key used in the
# result dict).  The gaps inside the labels are full-width spaces.
_FIELD_LABELS = (
    ("◎译　　名", 'version'),
    ("◎片　　名", 'name'),
    ("◎年　　代", 'time'),
    ("◎产　　地", 'place_of_origin'),
    ("◎类　　别", 'category'),
    ('◎语　　言', 'language'),
    ('◎字　　幕', 'title'),
    ('◎上映日期', 'release'),
    ('◎豆瓣评分', 'grade'),
    ('◎导　　演', 'director'),
    ('◎主　　演', 'protagonist'),
)


def _parse_movie_fields(lines):
    """Collect the labelled metadata lines of one detail page into a dict."""
    fields = {}
    for line in lines:
        for label, key in _FIELD_LABELS:
            if line.startswith(label):
                fields[key] = line.replace(label, '').strip()
                # No label is a prefix of another, so one match is enough.
                break
    return fields


def _save_movie(data):
    """Append one movie record to movice.txt while holding the module lock."""
    # Build the whole record first so the lock is held only for the write.
    # Missing fields are written as empty strings instead of raising
    # KeyError.
    record = ''.join([
        '译名:' + data.get('version', '') + '\n',
        '片名:' + data.get('name', '') + '\n',
        '年代:' + data.get('time', '') + '\n',
        '产地:' + data.get('place_of_origin', '') + '\n',
        '类别:' + data.get('category', '') + '\n',
        '上映时间' + data.get('release', ''),
        3 * '\n',
    ])
    # ``with lock`` releases the lock even if the write fails; a bare
    # acquire()/release() pair would leave every other thread blocked.
    with lock:
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        with open('movice.txt', 'a+', encoding='utf-8') as f:
            f.write(record)


def get_xpath(html):
    """Visit every movie linked from a list page, save it and print it.

    For each link found on the list page the detail page is downloaded with
    ``get_html``, its headline, labelled metadata lines and the hrefs inside
    the ``Zoom`` div are gathered into a dict, a text record is appended to
    ``movice.txt`` and the dict is printed.

    Args:
        html: Parsed list page (an object with an ``xpath`` method).
            Anything without ``xpath`` -- e.g. the exception object that
            ``get_html`` returns on failure, or ``None`` -- is reported and
            skipped instead of raising ``AttributeError``.

    Returns:
        None.
    """
    if not hasattr(html, 'xpath'):
        print('skipping list page, fetch failed: {!r}'.format(html))
        return
    hrefs = html.xpath('//div[@class="co_content8"]//tr//td//b//a//@href')
    for href in hrefs:
        url_l = 'https://www.dytt8.net' + href
        html_l = get_html(url=url_l)
        if not hasattr(html_l, 'xpath'):
            # One broken detail page must not abort the rest of the list.
            print('skipping {}, fetch failed: {!r}'.format(url_l, html_l))
            continue
        data = {}
        data['headline'] = html_l.xpath('//div[@class="title_all"]/h1/font/text()')
        print(url_l)
        zoom = html_l.xpath('.//div[@id="Zoom"]//.//text()')
        data.update(_parse_movie_fields(zoom))
        # NOTE(review): 'lilk' looks like a typo for 'link'; the key is kept
        # unchanged so the printed output stays the same.
        data['lilk'] = html_l.xpath('//div[@id="Zoom"]//a/@href')
        _save_movie(data)
        print(data)

def main(start_url, end_url):
    """Crawl a contiguous range of list pages.

    Args:
        start_url: Number of the first list page to crawl (inclusive).
        end_url: Page number at which to stop (exclusive).

    Returns:
        None.
    """
    for page in range(start_url, end_url):
        url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html".format(page)
        data = get_html(url=url)
        if not hasattr(data, 'xpath'):
            # get_html returns the caught exception instead of raising it;
            # skip this page so the remaining pages are still crawled.
            print('skipping {}, fetch failed: {!r}'.format(url, data))
            continue
        get_xpath(html=data)

if __name__ == '__main__':
    # Five worker threads, each crawling its own slice of list pages
    # (the second number of each pair is exclusive).
    thad = []
    t1 = Thread(target=main, args=(1, 10))
    t2 = Thread(target=main, args=(10, 20))
    t3 = Thread(target=main, args=(20, 30))
    t4 = Thread(target=main, args=(30, 40))
    t5 = Thread(target=main, args=(40, 50))
    # The right-hand side was lost when the code was pasted; without it the
    # statement is a syntax error and no thread is ever started.
    thad += [t1, t2, t3, t4, t5]
    for i in thad:
        i.start()
    # Wait for every worker to finish before the script exits.
    for i in thad:
        i.join()

来两碗米饭 发表于 2019-8-25 17:15

你流和我流发表于 2019-8-25 17:14
# Append the text to the file; the write must be indented under ``with``.
with open('filename', 'a+', encoding='utf-8') as f:
    f.write('你想保存的数据')
......

数据有缺失值，保存不了csv。

2019ghua 发表于 2019-8-25 16:31

看得似懂非懂，好想学，我是学习的:lol

你流和我流 发表于 2019-8-25 17:14

# Append the text to the file; the write must be indented under ``with``.
with open('filename', 'a+', encoding='utf-8') as f:
    f.write('你想保存的数据')
......
保存文件即可,还可以使用csv库,pandas库,百度一下你就知道

℡小疯、 发表于 2019-8-25 17:17

你都失败了，还分享出来干吗

来两碗米饭 发表于 2019-8-25 17:20

℡小疯、发表于 2019-8-25 17:17
你都失败了，还分享出来干吗

就缺一步保存了，其他没错啊。如果只获取前十页数据，用pandas保存csv就可以了。但是我写的是全部的

空白处、 发表于 2019-8-25 18:03

吾爱破解 - 52pojie.cn's Archiver

小白学习python爬虫，分享一下代码（失败的代码）