# 利用线程锁,保持写入数据同步
# 我试着保存为txt文档,没问题啊
# [Python] 纯文本查看 复制代码
import requests
from lxml import etree
import time
from threading import Thread
####################################
import threading
####################################
# Request headers sent with every HTTP request: a desktop Chrome User-Agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/75.0.3770.100 Safari/537.36"
    ),
}

# Lock shared by all worker threads; used to keep writes to the output file
# synchronized.
lock = threading.Lock()
def get_html(url, timeout=10):
    """Download *url* and parse the response into an lxml HTML tree.

    The response text is decoded as GBK. After a successful download the
    function sleeps for two seconds before returning, so every caller is
    throttled by this delay.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        The tree produced by ``etree.HTML``, or ``None`` when the request
        failed. (Previously the caught exception object itself was returned,
        which callers could not tell apart from a parsed page.)
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except EnvironmentError as exc:
        # Report the failure instead of handing the exception back as if it
        # were a page.
        print('request failed: {} ({})'.format(url, exc))
        return None
    response.encoding = 'gbk'
    html = etree.HTML(response.text)
    time.sleep(2)
    return html
# Label prefixes looked for in the detail text of a movie page, paired with
# the key each value is stored under in the record dict.
_FIELD_LABELS = (
    ("◎译 名", "version"),
    ("◎片 名", "name"),
    ("◎年 代", "time"),
    ("◎产 地", "place_of_origin"),
    ("◎类 别", "category"),
    ("◎语 言", "language"),
    ("◎字 幕", "title"),
    ("◎上映日期", "release"),
    ("◎豆瓣评分", "grade"),
    ("◎导 演", "director"),
    ("◎主 演", "protagonist"),
)


def _parse_movie_fields(lines):
    """Collect the labelled values found in *lines* into a dict."""
    fields = {}
    for text in lines:
        for label, key in _FIELD_LABELS:
            if text.startswith(label):
                fields[key] = text.replace(label, "").strip()
                break
    return fields


def _format_record(data):
    """Render the saved fields as one text record; missing fields are empty."""
    return (
        '译名:' + data.get('version', '') + '\n'
        + '片名:' + data.get('name', '') + '\n'
        + '年代:' + data.get('time', '') + '\n'
        + '产地:' + data.get('place_of_origin', '') + '\n'
        + '类别:' + data.get('category', '') + '\n'
        + '上映时间' + data.get('release', '')
        + 3 * '\n'
    )


def get_xpath(html):
    """Scrape every movie linked from a listing page and append it to a file.

    For each link found on the listing page, the detail page is downloaded
    with ``get_html``, its headline, labelled fields and first link inside
    the ``Zoom`` div are gathered into a dict, a text record is appended to
    ``movice.txt``, and the dict is printed.

    Args:
        html: Parsed listing page as returned by ``get_html``. Anything
            without an ``xpath`` method (``None`` or an exception object from
            a failed download) is ignored.

    Returns:
        None.
    """
    if not hasattr(html, "xpath"):
        # The listing page could not be downloaded; nothing to scrape.
        return
    for href in html.xpath('//div[@class="co_content8"]//tr//td//b//a//@href'):
        url_l = 'https://www.dytt8.net' + href
        detail = get_html(url=url_l)
        if not hasattr(detail, "xpath"):
            # A failed detail download must not abort the rest of the page.
            print('skipping {}: page could not be fetched'.format(url_l))
            continue
        data = {}
        headline = detail.xpath('//div[@class="title_all"]/h1/font/text()')
        data['headline'] = headline[0] if headline else ''
        print(url_l)
        data.update(
            _parse_movie_fields(detail.xpath('.//div[@id="Zoom"]//.//text()'))
        )
        links = detail.xpath('//div[@id="Zoom"]//a/@href')
        # 'lilk' is the key the record has always used; kept so the printed
        # dict stays the same.
        data['lilk'] = links[0] if links else ''
        record = _format_record(data)
        # Only the file append needs the lock. The ``with`` form releases it
        # even if the write raises, so a failure cannot leave the other
        # threads blocked forever.
        with lock:
            # Explicit encoding so the output does not depend on the
            # platform's default codec.
            with open('movice.txt', 'a', encoding='utf-8') as f:
                f.write(record)
        print(data)
def main(start_url, end_url):
    """Scrape a range of listing pages.

    Despite their names, both arguments are page numbers that are substituted
    into the listing URL.

    Args:
        start_url: First page number to scrape (inclusive).
        end_url: Page number to stop at (exclusive).

    Returns:
        None.
    """
    for page_no in range(start_url, end_url):
        url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html".format(page_no)
        listing = get_html(url=url)
        if not hasattr(listing, "xpath"):
            # get_html did not return a parsed page (it failed); skip this
            # page rather than letting the whole worker die and lose the
            # remaining pages of its range.
            print('skipping listing page {}: fetch failed'.format(url))
            continue
        get_xpath(html=listing)
if __name__ == '__main__':
    # One worker thread per slice of listing pages. main() treats the second
    # number as exclusive, so the slices do not overlap.
    page_ranges = [(1, 10), (10, 20), (20, 30), (30, 40), (40, 50)]
    workers = [Thread(target=main, args=bounds) for bounds in page_ranges]
    for worker in workers:
        worker.start()
    # Wait for every worker to finish before the script exits.
    for worker in workers:
        worker.join()