import requests from lxml import etree import time import re import random from fake_useragent import UserAgent
BASET = 'https://www.ygdy8.net'
# HTTP proxy pool; one entry is picked at random per request.
# Bug fix: '89.187.181.123:3128' was listed twice, doubling its selection odds.
pro = [
    '67.205.136.66:8080',
    '89.187.181.123:3128',
    '167.99.232.31:8080',
]
ua = UserAgent()
# A single random User-Agent chosen at import time and reused for every request.
headers = {'User-Agent': ua.random}


def get_one_url(url):
    """Fetch one list page and return the absolute URLs of its detail pages.

    Parameters:
        url: full URL of a paginated movie-list page.

    Returns:
        list[str] of absolute detail-page URLs (BASET + relative href).
        (Previously returned a one-shot ``map`` iterator; a list is
        backward-compatible and safe to iterate more than once.)
    """
    response = requests.get(
        url,
        headers=headers,
        proxies={'http': random.choice(pro)},
    )
    # The site serves GBK-family text; GB18030 is a superset, and
    # 'ignore' drops any malformed bytes instead of raising.
    text = response.content.decode('GB18030', 'ignore')
    html = etree.HTML(text)
    # Every movie entry on the list page sits inside a table.tbspan.
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    return [BASET + href for href in detail_urls]
def parse_detail_page(url):
    """Fetch one movie detail page; append its title and download links to movis.txt.

    Parameters:
        url: absolute URL of a movie detail page.

    Returns:
        (titles, links): the regex-extracted title strings and download
        link strings. (Previously returned None even though the caller
        assigned the result.)

    Side effects:
        Appends to 'movis.txt', prints each title/link, sleeps 6 seconds
        between requests to throttle the crawl.
    """
    response = requests.get(url, headers=headers,
                            proxies={'http': random.choice(pro)})
    # Same GBK-family decoding as the list pages.
    text = response.content.decode('GB18030', 'ignore')
    with open('movis.txt', 'a', encoding='utf-8') as file:
        # Title sits in the colored <font> of the page header.
        titles = re.findall(
            r'<div class="title_all"><h1><font color=#07519a>(.*?)</font></h1></div>',
            text, re.S)
        for title in titles:
            print(title)
            file.write(title)
            file.write('\n')
        # Download links live in table cells of the Zoom section.
        links = re.findall('<td.*?<a href="(.*?)">.*?</a></td>', text, re.S)
        for link in links:
            print(link)
            file.write(link)
            # Bug fix: links previously had no separator and ran together
            # on a single line in the output file.
            file.write('\n')
        # Record separator between movies.
        file.write('=' * 100 + '\n')
    # Throttle: be polite to the server between detail-page fetches.
    time.sleep(6)
    return titles, links
def mrin():
    """Crawl list pages 1-100 and parse every detail page found on each.

    NOTE(review): the name looks like a typo for 'main', but it is kept
    unchanged because it is called at module level.
    """
    na_url = 'https://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'
    for page in range(1, 101):
        url = na_url.format(page)
        detail_urls = get_one_url(url)
        for detail_url in detail_urls:
            # Result intentionally discarded; the parser persists to movis.txt.
            parse_detail_page(detail_url)
# Guard the entry point so importing this module does not start a
# 100-page crawl as a side effect.
if __name__ == '__main__':
    mrin()