好友
阅读权限20
听众
最后登录1970-1-1
|
本帖最后由 htcperfect 于 2021-6-22 12:42 编辑
爬取的是猫眼的数据
[Python] 纯文本查看 复制代码
# -*- coding: utf-8 -*-
import requests
import os
from bs4 import BeautifulSoup
import csv
# Number of listing pages to crawl.
page = 1
# Name of the CSV file the results are appended to.
file_name = "xx影院.csv"
# Cookie string sent with every request; capture it from your own browser.
cookie = ''
# Maoyan has anti-scraping measures; pages can be fetched once browser-like
# headers and a cookie are supplied.
headers = {
    'Content-Type': 'text/plain; charset=UTF-8',
    'Origin': 'https://maoyan.com',
    'Referer': 'https://maoyan.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'cookie': cookie,
}
def get_one_page(url, headers):
    """Download a page and return its source text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers sent with the GET request.

    Returns:
        The response body as text when the server answers with status 200;
        ``None`` for any other status or when the request itself fails.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Base class of the errors requests raises (connection, timeout, ...).
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Visit every cinema linked from a listing page and record its details.

    Each ``<a class="cinema-name">`` element's ``href`` is appended to
    ``https://maoyan.com``, the resulting page is downloaded with the
    module-level ``headers`` and handed to ``parse_one_pageinfo``.

    Args:
        html: Markup of a cinema listing page, or ``None`` when the download
            failed (in which case nothing is done).
    """
    if not html:
        return
    soup = BeautifulSoup(html, 'lxml')
    for link in soup.find_all('a', attrs={"class": "cinema-name"}):
        href = link.get('href')
        if not href:
            # Anchor without a target: there is no detail page to visit.
            continue
        html_info = get_one_page("https://maoyan.com" + href, headers)
        if html_info is None:
            # Detail page could not be fetched; carry on with the next cinema.
            continue
        parse_one_pageinfo(html_info)
def parse_one_pageinfo(html):
    """Extract a cinema's name, address and phone and append them to the CSV.

    The three values are printed and then passed to ``write_to_file_csv`` as
    one row ``[name, address, phone]``.

    Args:
        html: Markup of a cinema detail page, or ``None`` when the download
            failed (in which case nothing is done).
    """
    if not html:
        return
    soup = BeautifulSoup(html, 'lxml')
    name_node = soup.find('h3', attrs={"class": "name text-ellipsis"})
    if name_node is None:
        # No cinema heading found, so there is nothing meaningful to record
        # (presumably an error or verification page -- TODO confirm).
        return
    address_node = soup.find('div', attrs={"class": "address text-ellipsis"})
    phone_node = soup.find('div', attrs={"class": "telphone"})
    cinema_info = [
        name_node.string,
        # Address and phone are optional; csv writes None as an empty cell.
        address_node.string if address_node is not None else None,
        phone_node.string if phone_node is not None else None,
    ]
    for value in cinema_info:
        print(value)
    write_to_file_csv(cinema_info)
def write_to_file_csv(item):
    """Append ``item`` as one row to the CSV file named by ``file_name``.

    Args:
        item: Sequence of cell values making up the row.
    """
    # Mode 'a' appends to the file; the utf_8_sig encoding keeps the exported
    # CSV from showing garbled characters.
    with open(file_name, 'a', encoding='utf_8_sig', newline='') as csv_file:
        csv.writer(csv_file).writerow(item)
def main(offset):
    """Crawl one cinema listing page and record every cinema it links to.

    Args:
        offset: Value of the ``offset`` query parameter of the listing URL.
    """
    url = "https://maoyan.com/cinemas?offset=" + str(offset)
    print(url)
    html = get_one_page(url, headers)
    if html is None:
        # get_one_page returns None when the download fails; nothing to parse.
        return
    parse_one_page(html)
if __name__ == '__main__':
    # Crawl each listing page in turn; consecutive pages are 12 offsets apart.
    per_page = 10 + 2
    for page_index in range(page):
        main(page_index * per_page)
成品就不展示了 |
免费评分
-
查看全部评分
|
发帖前要善用【论坛搜索】功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。 |
|
|
|
|