import requests
import lxml  # parser backend used by BeautifulSoup below
from bs4 import BeautifulSoup
import os
import urllib.request


def get_html(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36'}
    try:
        html = requests.get(url, headers=headers)
        html.raise_for_status()
        html.encoding = 'utf-8'
        return html.text
    except requests.RequestException:
        print('something is wrong')


def get_url(url):
    # Fetch the listing page and collect the link of every avatar album on it
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    albums = soup.find_all('div', class_="txList ")
    site = 'https://www.woyaogexing.com'
    save_dir = 'D:/1234/girl/'
    os.makedirs(save_dir, exist_ok=True)  # make sure the target folder exists
    for i in albums:
        href = i.a['href']
        baseurl = site + href
        # Open the album page and find every image list item inside it
        html_img = requests.get(baseurl)
        html_img.encoding = 'utf-8'
        soup_src = BeautifulSoup(html_img.text, 'lxml')
        src = soup_src.find_all('li', class_="tx-img")
        for a in src:
            li = a.a['href']
            base = 'https:' + li  # the href is protocol-relative, so prepend the scheme
            # Download the image and save it under its original file name
            req = urllib.request.Request(base)
            req.add_header('user-agent', 'Mozilla/5.0')
            buf = urllib.request.urlopen(req).read()
            filename = save_dir + base.split('/')[-1]
            with open(filename, 'wb') as file:
                file.write(buf)
            print('Saved successfully!')


if __name__ == '__main__':
    url = 'https://www.woyaogexing.com/touxiang/weixin/'  # any other woyaogexing listing URL works too; for pagination just add a for loop (see the sketch below)
    get_url(url)
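
For the pagination mentioned in the last comment, here is a minimal sketch. Assumption: I am guessing that the later listing pages use URLs like index_2.html, index_3.html under the same path; please verify the real URL pattern in your browser before using it.

if __name__ == '__main__':
    base = 'https://www.woyaogexing.com/touxiang/weixin/'
    get_url(base)                    # page 1 is the plain listing URL
    for page in range(2, 6):         # pages 2-5; adjust the range as needed
        # ASSUMPTION: pagination pattern index_<n>.html, not confirmed against the site
        get_url(base + 'index_{}.html'.format(page))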
I'm just a beginner; if anything can still be improved, please point it out so we can learn together.